### Import and Settings

In [100]:
import pandas as pd

# Let pandas display up to 500 rows before truncating a printed frame.
pd.options.display.max_rows = 500

### Read in files and reformat

In [119]:
# Load the two raw 2023 tables (understat and futbin) with default CSV parsing.
ustat, futbin = map(
    pd.read_csv,
    ("csv files/ustat_2023.csv", "csv files/futbin_2023.csv"),
)

In [120]:
# Give futbin the same player/club column names the understat frame uses,
# then parse the 'dob' strings as dates, reading ambiguous values day-first.
futbin = futbin.rename(columns={"name": "player_name", "club": "team_title"}).assign(
    dob=lambda frame: pd.to_datetime(frame['dob'], dayfirst=True)
)

In [121]:
# team_title can hold a comma-separated list of clubs: split it and emit one
# row per club, then rebuild a clean 0..n-1 index.
ustat = (
    ustat
    .assign(team_title=ustat['team_title'].str.split(','))
    .explode('team_title')
    .reset_index(drop=True)
)

## Dataset matching

* Uses the club and player IDs to match the understat and futbin datasets
* Removes duplicates and missing data from the result, then renames columns to match the database

In [122]:
# Club lookup table: its 'db_id' column is exposed as 'db_club_id' so it can be
# attached to both source frames under one common name.
unpack_ex = (
    pd.read_csv("data repair csvs/fix_club_ids.csv")
    .rename(columns={"db_id": "db_club_id"})
)

# understat rows are looked up by team name (team_title == us_team) ...
ustat = pd.merge(
    ustat,
    unpack_ex[['us_team', 'db_club_id']],
    how='left',
    left_on='team_title',
    right_on='us_team',
)

# ... while futbin rows are looked up by futbin's club id (club_id == futbin_id).
futbin = pd.merge(
    futbin,
    unpack_ex[['futbin_id', 'db_club_id']],
    how='left',
    left_on='club_id',
    right_on='futbin_id',
)

In [123]:
# Player-name lookup table: discard the 'futbin_name' and 'score' columns and
# rename the remaining id/name columns to the names used for joining.
unpack_nam = (
    pd.read_csv("data repair csvs/fix_name_ids.csv")
    .drop(columns=['futbin_name', 'score'])
    .rename(columns={"id": "player_id", "ustat_name": "player_name"})
)

# Attach a player_id to each understat row by matching on player_name;
# rows with no match keep a missing player_id (left join).
ustat = pd.merge(ustat, unpack_nam, how='left', on='player_name')

In [124]:
# Join every futbin row to its understat row on (player_id, db_club_id).
#
# pandas treats null join keys as equal to each other, so an understat row
# whose player_id or db_club_id is missing (no entry in the lookup tables)
# could pair with a futbin row missing the same key, even though nothing was
# actually matched. Drop understat rows with a missing key first so that only
# genuine id matches are joined. futbin rows are all kept (left join); those
# without a match simply get missing understat columns.
comb = pd.merge(
    futbin,
    ustat.dropna(subset=['player_id', 'db_club_id']),
    how='left',
    on=['player_id', 'db_club_id'],
)

In [125]:
# Build the export table in one pass:
#   1. keep only rows that have a 'shots' value,
#   2. remove exact duplicate rows,
#   3. discard the 'us_team' helper column left over from the club lookup,
#   4. rename columns (the _x/_y suffixes come from the futbin/understat merge).
cleaned = (
    comb
    .dropna(subset=['shots'])
    .drop_duplicates()
    .drop(columns='us_team')
    .rename(columns={
        "player_name_x": "player_name_futbin",
        "team_title_x": "club_name_futbin",
        "player_name_y": "player_name_ustat",
        "team_title_y": "club_name_ustat",
        "id": "player_id_ustat",
        "club_id": "club_id_futbin",
        'pos': 'position',
        'db_club_id': 'club_id',
        'year': 'season',
        'league_id': 'comp_id',
        'games': 'apps',
        'time': 'minutes',
    })
)

In [126]:
# Put the literal string 'NULL' into every missing cell so the exported file
# carries an explicit null marker for the database.
cleaned = cleaned.fillna(value='NULL')

In [127]:
# Write the finished table to CSV, leaving out the DataFrame index column.
cleaned.to_csv(path_or_buf="csv files/cleaned_2023.csv", index=False)