In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Columns removed from the frame right after loading; any that are absent
# from the CSV are skipped silently (errors='ignore').
drop_cols = ['Unnamed: 0', 'ID', 'Name', 'Photo', 'Flag', 'Club Logo', 'Jersey Number', 'Joined']
players = (
    pd.read_csv('../data/data.csv', encoding='utf-8')
    .drop(columns=drop_cols, errors='ignore')
)

In [3]:
def transform_money(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_money``.

    The input frame is not modified. Every listed column is replaced,
    element by element, by the value ``parse_money`` returns for it.
    """
    converted = df.copy()
    for name in colnames:
        converted[name] = converted[name].apply(parse_money)
    return converted

def parse_money(x):
    """Convert a euro amount string such as '€110.5M' or '€565K' to a float.

    Parameters
    ----------
    x : str, number or null
        A string containing '€' followed by a number and an optional
        one-character suffix. 'K' scales the number by 1,000 and 'M' by
        1,000,000; any other suffix (or none) leaves it unscaled.
        Null values and values that are already numeric are accepted too.

    Returns
    -------
    float or null
        The parsed amount. Null input is returned unchanged; numeric input
        is returned as a float.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no '€<number>' amount.
    """
    if pd.isnull(x):
        return x
    # Already-numeric values (e.g. a column that was cleaned before, or one
    # pandas read as numbers) need no parsing; re.search would raise TypeError.
    if isinstance(x, (int, float, np.number)):
        return float(x)
    m = re.search(r'€(\d+\.?\d*)(\w?)', x)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError('Cannot parse money value: {!r}'.format(x))
    multipliers = {'K': 1000, 'M': 1000000}
    return float(m.group(1)) * multipliers.get(m.group(2), 1)

In [4]:
def transform_height(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_height``.

    The input frame is not modified. Every listed column is replaced,
    element by element, by the value ``parse_height`` returns for it.
    """
    converted = df.copy()
    for name in colnames:
        converted[name] = converted[name].apply(parse_height)
    return converted

def parse_height(x):
    """Convert a height string such as "5'7" to a single integer.

    The number before the apostrophe is multiplied by 12 and added to the
    number after it (i.e. the string is read as feet and inches and the
    result is the total in inches).

    Parameters
    ----------
    x : str, number or null
        A string containing ``<digits>'<digits>``. Null values and values
        that are already numeric are accepted too.

    Returns
    -------
    int, or ``x`` unchanged
        The combined value for string input; null or numeric input is
        returned as-is.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no ``<digits>'<digits>`` pattern.
    """
    if pd.isnull(x):
        return x
    # Already-numeric values need no parsing; re.search would raise TypeError.
    if isinstance(x, (int, float, np.number)):
        return x
    m = re.search(r"(\d+)'(\d+)", x)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError('Cannot parse height value: {!r}'.format(x))
    major, minor = m.groups()
    return int(major) * 12 + int(minor)

In [5]:
def transform_weight(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_weight``.

    The input frame is not modified. Every listed column is replaced,
    element by element, by the value ``parse_weight`` returns for it.
    """
    converted = df.copy()
    for name in colnames:
        converted[name] = converted[name].apply(parse_weight)
    return converted

def parse_weight(x):
    """Convert a weight string such as '159lbs' to the integer it starts with.

    Parameters
    ----------
    x : str, number or null
        A string containing ``<digits>lbs``. Null values and values that
        are already numeric are accepted too.

    Returns
    -------
    int, or ``x`` unchanged
        The number preceding 'lbs' for string input; null or numeric input
        is returned as-is.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no ``<digits>lbs`` pattern.
    """
    if pd.isnull(x):
        return x
    # Already-numeric values need no parsing; re.search would raise TypeError.
    if isinstance(x, (int, float, np.number)):
        return x
    m = re.search(r'(\d+)lbs', x)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError('Cannot parse weight value: {!r}'.format(x))
    return int(m.group(1))

In [6]:
def transform_pos_ratings(df, colnames):
    """Return a copy of ``df`` with each column in ``colnames`` run through ``parse_pos_ratings``.

    The input frame is not modified. Every listed column is replaced,
    element by element, by the value ``parse_pos_ratings`` returns for it.
    """
    converted = df.copy()
    for name in colnames:
        converted[name] = converted[name].apply(parse_pos_ratings)
    return converted

def parse_pos_ratings(x):
    """Extract the base number from a rating string such as '88+2'.

    Only the integer before the '+' is kept; the digit after it is
    discarded.

    Parameters
    ----------
    x : str, number or null
        A string containing ``<digits>+<digit>``. Null values and values
        that are already numeric are accepted too.

    Returns
    -------
    int, or ``x`` unchanged
        The number before '+' for string input; null or numeric input is
        returned as-is.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no ``<digits>+<digit>`` pattern.
    """
    if pd.isnull(x):
        return x
    # Already-numeric values need no parsing; re.search would raise TypeError.
    if isinstance(x, (int, float, np.number)):
        return x
    m = re.search(r'(\d+)\+\d', x)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError('Cannot parse position rating: {!r}'.format(x))
    return int(m.group(1))

In [7]:
def transform_loaned(df):
    """Return a copy of ``df`` with loan information folded into other columns.

    For rows whose 'Loaned From' value is not null:
      * 'Club' is overwritten with the 'Loaned From' value;
      * 'Contract Valid Until' is cut down to its last four characters.

    A boolean 'Loaned Out' column is appended (True exactly for those rows)
    and the 'Loaned From' column is removed. The input frame is not modified.
    """
    result = df.copy()
    on_loan = result['Loaned From'].notna()

    # Keep only the trailing four characters of the contract field for loaned rows.
    contract_tail = result.loc[on_loan, 'Contract Valid Until'].str[-4:]

    result.loc[on_loan, 'Club'] = result.loc[on_loan, 'Loaned From']
    result.loc[on_loan, 'Contract Valid Until'] = contract_tail
    result['Loaned Out'] = on_loan
    return result.drop(columns='Loaned From')

In [8]:
# Apply the cleaning steps in order; each transform returns a new frame,
# so the chain simply threads the result of one step into the next.
players = (
    players
    .pipe(transform_money, ['Value', 'Wage', 'Release Clause'])
    .pipe(transform_weight, ['Weight'])
    .pipe(transform_height, ['Height'])
    .pipe(transform_pos_ratings, 'LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB'.split(','))
    .pipe(transform_loaned)
)

In [9]:
# Show the transformed frame as this cell's output.
players

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred Foot,International Reputation,...,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,Loaned Out
0,31,Argentina,94,94,FC Barcelona,110500000.0,565000.0,2202,Left,5.0,...,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0,False
1,33,Portugal,94,94,Juventus,77000000.0,405000.0,2228,Right,5.0,...,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0,False
2,26,Brazil,92,93,Paris Saint-Germain,118500000.0,290000.0,2143,Right,5.0,...,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0,False
3,27,Spain,91,93,Manchester United,72000000.0,260000.0,1471,Right,4.0,...,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,138600000.0,False
4,27,Belgium,91,92,Manchester City,102000000.0,355000.0,2281,Right,4.0,...,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18202,19,England,47,65,Crewe Alexandra,60000.0,1000.0,1307,Right,1.0,...,40.0,48.0,47.0,10.0,13.0,7.0,8.0,9.0,143000.0,False
18203,19,Sweden,47,63,Trelleborgs FF,60000.0,1000.0,1098,Right,1.0,...,22.0,15.0,19.0,10.0,9.0,9.0,5.0,12.0,113000.0,False
18204,16,England,47,67,Cambridge United,60000.0,1000.0,1189,Right,1.0,...,32.0,13.0,11.0,6.0,5.0,10.0,6.0,13.0,165000.0,False
18205,17,England,47,66,Tranmere Rovers,60000.0,1000.0,1228,Right,1.0,...,20.0,25.0,27.0,14.0,6.0,14.0,8.0,9.0,143000.0,False


In [10]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
players.to_csv('../data/data_cleaned.csv', index=False)