In [133]:
import pandas as pd
import os
import numpy as np
import regex as re

In [5]:
# Season labels, kept as strings because they are interpolated into folder paths.
years = [str(season) for season in range(2016, 2020)]

In [375]:
# Map each season to the team names found in its team_stats folder.
# A team name is the file stem with surrounding whitespace stripped, which is
# the same normalisation used for the keys of team_data and for the
# opponent lookup (both compare against stripped names).
schools = {}

for year in years:
    schools[year] = [
        os.path.splitext(fp)[0].strip()
        for fp in os.listdir(f'raw/{year}/team_stats')
    ]

In [149]:
# Root folder for the processed per-team CSV files. The export loop builds its
# target as f'{output_dir}/{year}/{s}.csv', so the trailing slash here results
# in a doubled separator in that path.
output_dir = 'processed/'

In [179]:
def get_opp_team(desc, year=None):
    """Extract a known team name from an opponent description.

    ``desc`` is split on '@' and each piece is stripped; the first piece that
    is a known team name is returned.

    Parameters
    ----------
    desc : str
        Opponent description, e.g. '@ Duke'.
    year : str, optional
        Season whose team list (``schools[year]``) should be searched. When
        omitted, the team lists of every season in ``schools`` are searched.

    Returns
    -------
    str or float
        The matching team name, or ``np.nan`` when nothing matches or ``desc``
        is not a string.
    """
    if not isinstance(desc, str):
        # Blank cells are read as NaN floats, which cannot be split.
        return np.nan

    # `schools` maps season -> list of names, so membership has to be tested
    # against the lists themselves rather than against the dict's year keys.
    if year is None:
        rosters = list(schools.values())
    else:
        rosters = [schools[year]]

    for piece in desc.split('@'):
        candidate = piece.strip()
        if any(candidate in roster for roster in rosters):
            return candidate

    return np.nan

In [181]:
def reformat_win(desc):
    """Encode a game result as a binary win flag.

    Returns 1 when ``desc`` is a string starting with 'W' and 0 otherwise.
    Empty strings and non-string values (such as the 0 placeholder left by a
    prior ``fillna(0)``) are treated as "not a win" instead of raising.
    """
    return 1 if isinstance(desc, str) and desc.startswith('W') else 0

In [216]:
def convert_height(height):
    """Convert a 'feet-inches' height string (e.g. '6-2') to total inches.

    Returns ``None`` when ``height`` is not a string or does not split on '-'
    into exactly two integer parts (for instance the bare '-' placeholder).
    """
    try:
        feet, inches = height.split('-')
        return int(feet) * 12 + int(inches)
    except (AttributeError, ValueError):
        # AttributeError: non-string input (None, NaN from a blank cell) has no
        # .split(). ValueError: wrong number of parts or a non-integer part.
        return None

In [377]:
# team_data[year][team] -> {'Avg <param>': ..., 'Avg Height': ...}, built from
# the per-team CSV files under raw/<year>/team_stats.
team_data = {}
params = ['Kills', 'Errors', 'Total Attacks', 'Hit Pct', 'Assists', 'Aces',
          'SErr', 'Digs', 'RErr', 'Block Solos', 'Block Assists', 'BErr', 'PTS',
          'BHE']

for year in years:
    year_data = {}
    for fp in os.listdir(f'raw/{year}/team_stats'):
        s = os.path.splitext(fp)[0].strip()
        team_df = pd.read_csv(f'raw/{year}/team_stats/{fp}')

        team_averages = {}
        for param in params:
            # A column containing the '-' placeholder is read as strings;
            # coercing turns placeholders into NaN (skipped by mean) and the
            # remaining entries into real numbers before averaging.
            values = pd.to_numeric(team_df[param], errors='coerce')
            team_averages[f'Avg {param}'] = round(values.mean(), 2)

        # Height does not depend on the stat being averaged, so it is computed
        # once per team. Missing and '-' entries are excluded up front;
        # anything convert_height cannot parse becomes NaN and is skipped.
        ht = team_df['Ht'].dropna()
        heights = pd.to_numeric(ht[ht != '-'].apply(convert_height), errors='coerce')
        team_averages['Avg Height'] = round(heights.mean(), 2)

        year_data[s] = team_averages
    team_data[year] = year_data

In [374]:
# Do the 2016 and 2017 seasons cover exactly the same set of teams?
set(team_data['2016']) == set(team_data['2017'])

False

In [325]:
def get_team_b_data(team_b, year=None):
    """Look up the stored averages for a team.

    ``team_data`` is nested as ``{year: {team: averages}}``, so a team name
    alone is not a valid top-level key.

    Parameters
    ----------
    team_b : str
        Team name (or, when ``year`` is omitted, a top-level key of
        ``team_data``).
    year : str, optional
        Season to look the team up in. When omitted, the original
        single-level lookup ``team_data[team_b]`` is performed.

    Raises
    ------
    KeyError
        If the requested key is not present.
    """
    if year is None:
        return team_data[team_b]
    return team_data[year][team_b]

In [326]:
# Feature column names, interleaved per stat: 'Team A <stat>', 'Team B <stat>'.
feature_cols = [
    f'Team {side} {stat}'
    for stat in params
    for side in ('A', 'B')
]

In [384]:
# The first 16 columns of each frame are moved to the end before it is written.
# NOTE(review): presumably 16 is the number of game-log columns left after the
# drop below, so that the Team A/B features lead the file; confirm against the
# raw game-by-game layout.
N_LEADING_GAME_COLS = 16


def _match_opponent(desc, known_teams):
    """Return the first '@'-separated piece of ``desc`` found in ``known_teams``, else NaN."""
    if not isinstance(desc, str):
        # Blank cells are read as NaN floats, which cannot be split.
        return np.nan
    for piece in desc.split('@'):
        name = piece.strip()
        if name in known_teams:
            return name
    return np.nan


def _read_game_log(path):
    """Read a game-by-game CSV whose first data row holds the real column names."""
    raw = pd.read_csv(path)
    raw.columns = raw.iloc[0].values
    # Discard the promoted header row and the last column, then renumber rows from 0.
    return raw.iloc[1:, :-1].reset_index(drop=True)


for year in years:
    known_teams = set(schools[year])
    season_stats = team_data[year]

    # DataFrame.to_csv does not create missing folders, so make the target first.
    season_dir = os.path.join(output_dir, year)
    os.makedirs(season_dir, exist_ok=True)

    for fp in os.listdir(f'raw/{year}/team_game_by_game'):
        # Team name = file stem with any parenthesised part removed.
        s = re.sub(r'\([^)]*\)', '', os.path.splitext(fp)[0]).strip()
        df = _read_game_log(f'raw/{year}/team_game_by_game/{fp}')

        # Keep only games whose opponent is a known team for this season.
        df['Opp'] = df['Opponent'].apply(_match_opponent, args=(known_teams,))
        df = df.dropna(subset=['Opp']).fillna(0).rename(columns={'S': 'Sets'})
        df['Result'] = df['Result'].apply(reformat_win)

        # Attach season averages: Team A is this file's team, Team B the opponent.
        own_stats = season_stats[s]
        for param in params:
            avg_key = f'Avg {param}'
            df[f'Team A {param}'] = round(own_stats[avg_key], 2)
            df[f'Team B {param}'] = (
                df['Opp'].apply(lambda opp: season_stats[opp][avg_key]).round(2)
            )

        df = df.drop(columns=['Date', 'MP', 'Opponent', 'Opp'])

        cols = df.columns.to_list()
        df = df[cols[N_LEADING_GAME_COLS:] + cols[:N_LEADING_GAME_COLS]]

        df.to_csv(os.path.join(season_dir, f'{s}.csv'), index=False)