In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Read the three input CSVs (player table, league/team mapping, league
# revenue mapping) in a fixed order, all decoded as UTF-8.
players, league_team, league_rev = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        '../data/data.csv',
        '../data/league_team_mapping.csv',
        '../data/league_country_level_revenue_mapping.csv',
    )
)

In [3]:
# Left-join the league/team mapping with the league-level table on 'League',
# then prefix every resulting column with 'League ' (so the join key itself
# becomes 'League League').
league = (
    league_team
    .merge(league_rev, how='left', on='League')
    .add_prefix('League ')
)

In [4]:
# Attach league attributes to each player by matching the player's 'Club'
# against 'League Team'; a left join keeps players whose club has no match.
players = players.merge(
    league,
    how='left',
    left_on='Club',
    right_on='League Team',
)

In [5]:
# Columns removed before cleaning. errors='ignore' means any name that is
# not present in the frame is silently skipped.
drop_cols = [
    'Unnamed: 0',
    'ID',
    'Name',
    'Photo',
    'Flag',
    'Club Logo',
    'Jersey Number',
    'Joined',
    'League Team',
    'League League',
    'Body Type',
]
players = players.drop(columns=drop_cols, errors='ignore')

In [6]:
def transform_money(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_money``.

    The input frame is left unmodified; every listed column is replaced,
    element by element, with the value ``parse_money`` returns for it.
    """
    cleaned = df.copy()
    for name in colnames:
        parsed = cleaned[name].apply(parse_money)
        cleaned[name] = parsed
    return cleaned

def parse_money(x):
    """Convert a euro amount string to a float expressed in millions of euros.

    Recognised inputs look like ``'€110.5M'``, ``'€500K'`` or ``'€0'``: a euro
    sign, a number, and an optional single-character unit.

    Parameters
    ----------
    x : str or null
        Raw cell value.

    Returns
    -------
    float or null
        * null input is returned unchanged;
        * ``'M'`` amounts are returned as-is, ``'K'`` amounts are divided by
          1000, and any other (or missing) unit is treated as plain euros and
          divided by 1,000,000;
        * NaN when the string does not contain a euro amount at all.
    """
    if pd.isnull(x):
        return x
    m = re.search(r'€(\d+\.?\d*)(\w?)', x)
    if m is None:
        # No euro amount in the text: report it as missing instead of failing
        # with an AttributeError on ``m.group``.
        return float('nan')
    value = float(m.group(1))
    unit = m.group(2)
    if unit == 'K':
        return value / 1000
    if unit == 'M':
        return value
    # No unit (or an unrecognised one): the number is in plain euros.
    return value / 1000000

In [7]:
def del_no_value(df):
    """Return a new frame containing only the rows whose ``Value`` is greater than 0.

    Rows with a missing ``Value`` are dropped as well, because a comparison
    against NaN evaluates to False. The original index labels are kept, and
    the result is an independent copy, so later modifications to it do not
    touch (or warn about) the input frame.
    """
    has_value = df['Value'] > 0
    return df.loc[has_value].copy()

In [8]:
def transform_height(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_height``.

    The input frame is left unmodified; every listed column is replaced,
    element by element, with the value ``parse_height`` returns for it.
    """
    cleaned = df.copy()
    for name in colnames:
        parsed = cleaned[name].apply(parse_height)
        cleaned[name] = parsed
    return cleaned

def parse_height(x):
    """Convert a feet'inches string such as ``"5'7"`` to a total number of inches.

    Parameters
    ----------
    x : str or null
        Raw cell value.

    Returns
    -------
    int, float or null
        Null input is returned unchanged; a matching string yields
        ``feet * 12 + inches`` as an int; a string without a
        ``<digits>'<digits>`` pattern yields NaN.
    """
    if pd.isnull(x):
        return x
    m = re.search(r'(\d+)\'(\d+)', x)
    if m is None:
        # Unrecognised format: report it as missing instead of failing with
        # an AttributeError on ``m.group``.
        return float('nan')
    feet, inches = int(m.group(1)), int(m.group(2))
    return feet * 12 + inches

In [9]:
def transform_weight(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_weight``.

    The input frame is left unmodified; every listed column is replaced,
    element by element, with the value ``parse_weight`` returns for it.
    """
    cleaned = df.copy()
    for name in colnames:
        parsed = cleaned[name].apply(parse_weight)
        cleaned[name] = parsed
    return cleaned

def parse_weight(x):
    """Extract the integer number of pounds from a string such as ``'159lbs'``.

    Parameters
    ----------
    x : str or null
        Raw cell value.

    Returns
    -------
    int, float or null
        Null input is returned unchanged; a matching string yields the digits
        preceding ``lbs`` as an int; a string without that pattern yields NaN.
    """
    if pd.isnull(x):
        return x
    m = re.search(r'(\d+)lbs', x)
    if m is None:
        # Unrecognised format: report it as missing instead of failing with
        # an AttributeError on ``m.group``.
        return float('nan')
    return int(m.group(1))

In [10]:
def transform_pos_ratings(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_pos_ratings``.

    The input frame is left unmodified; every listed column is replaced,
    element by element, with the value ``parse_pos_ratings`` returns for it.
    """
    cleaned = df.copy()
    for name in colnames:
        parsed = cleaned[name].apply(parse_pos_ratings)
        cleaned[name] = parsed
    return cleaned

def parse_pos_ratings(x):
    """Extract the base rating from a ``'<base>+<bonus>'`` string such as ``'88+2'``.

    Parameters
    ----------
    x : str or null
        Raw cell value.

    Returns
    -------
    int, float or null
        Null input is returned unchanged; a matching string yields the digits
        before the ``+`` as an int (the bonus is discarded); a string without
        a ``<digits>+<digit>`` pattern yields NaN.
    """
    if pd.isnull(x):
        return x
    m = re.search(r'(\d+)\+\d', x)
    if m is None:
        # Unrecognised format: report it as missing instead of failing with
        # an AttributeError on ``m.group``.
        return float('nan')
    return int(m.group(1))

In [11]:
def transform_loaned(df):
    """Return a copy of ``df`` with loan information folded into other columns.

    For every row whose 'Loaned From' is not null:

    * 'Club' is overwritten with the 'Loaned From' value;
    * 'Contract Valid Until' is reduced to its last four characters.

    A boolean 'Loaned Out' column is added (True for those rows, False
    otherwise) and 'Loaned From' is removed. The input frame is not modified.
    """
    result = df.copy()
    on_loan = result['Loaned From'].notna()

    # Attribute loaned players to the club named in 'Loaned From'.
    result.loc[on_loan, 'Club'] = result.loc[on_loan, 'Loaned From']

    # Keep only the trailing four characters of the contract field.
    loan_contracts = result.loc[on_loan, 'Contract Valid Until']
    result.loc[on_loan, 'Contract Valid Until'] = loan_contracts.str.slice(-4)

    # Default everyone to "not loaned", then flag the loaned rows.
    result['Loaned Out'] = False
    result.loc[on_loan, 'Loaned Out'] = True

    return result.drop(columns='Loaned From')

In [12]:
def transform_work_rate(df):
    """Return a copy of ``df`` with 'Work Rate' split into two columns.

    'Work Rate' is split on '/'; the first part becomes 'Off Work Rate' and
    the second 'Def Work Rate'. Whitespace around each part is stripped so
    that inputs such as ``'High/ Medium'`` give ``'Medium'`` rather than
    ``' Medium'``. Null work rates stay null in both new columns. The
    original 'Work Rate' column is removed; the input frame is not modified.
    """
    result = df.copy()
    parts = result['Work Rate'].str.split('/')
    result['Off Work Rate'] = parts.str[0].str.strip()
    result['Def Work Rate'] = parts.str[1].str.strip()
    return result.drop(columns='Work Rate')

In [13]:
# Apply the cleaning steps in order. Money columns are converted first
# because del_no_value filters on a numeric comparison against 'Value'.
players = (
    players
    .pipe(transform_money, ['Value', 'Release Clause'])
    .pipe(del_no_value)
    .pipe(transform_weight, ['Weight'])
    .pipe(transform_height, ['Height'])
    .pipe(
        transform_pos_ratings,
        'LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB'.split(','),
    )
    .pipe(transform_loaned)
    .pipe(transform_work_rate)
)

In [14]:
# Replace every space in the column names with an underscore.
players.columns = players.columns.map(lambda name: name.replace(' ', '_'))

In [15]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
players.to_csv('../data/data_cleaned.csv', index=False)