# Imports and Settings 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Data Wrangling

In [2]:
# Read the player statistics JSON (a list of records) into a DataFrame.
df = pd.read_json(
    '../../data/json/players_stats.json',
    orient='records',
    encoding='utf-8',
)
# Report (rows, columns), then preview the first ten rows.
print(df.shape)
df.head(10)

(18055, 59)


Unnamed: 0,Acceleration,Aggression,Agility,Balance,BallControl,Body Type,Composure,Crossing,Curve,Dislike,...,Work Rate,age,birth_date,height,name,photo_url,player_hashtags,player_traits,positions,weight
0,92,48,90,95,96,Messi,97,77,90,197,...,Medium / Medium,30,1987/Jun/24,170,Lionel Messi,https://cdn.sofifa.org/1x/18/players/158023.png,"#Dribbler,#FK Specialist,#Acrobat,#Clinical Fi...","Finesse Shot,Long Shot Taker,Speed Dribbler,Pl...","CF,ST,RW",72
1,89,63,89,63,93,C. Ronaldo,95,85,81,260,...,High / Low,32,1985/Feb/5,187,C. Ronaldo dos Santos Aveiro,https://cdn.sofifa.org/1x/18/players/20801.png,"#Speedster,#Dribbler,#Distance Shooter,#Acroba...","Power Free Kick,Diver,Flair,Long Shot Taker,Sp...","LW,ST",83
2,94,56,96,82,95,Neymar,92,75,82,74,...,High / Medium,25,1992/Feb/5,175,Neymar da Silva Santos Jr.,https://cdn.sofifa.org/1x/18/players/190871.png,"#Speedster,#Dribbler,#Acrobat","Diver,Flair,Speed Dribbler,Technical Dribbler,...",LW,68
3,88,87,86,78,91,Normal,85,77,86,48,...,High / Medium,30,1987/Jan/24,182,Luis Suárez,https://cdn.sofifa.org/1x/18/players/176580.png,"#Dribbler,#Acrobat,#Clinical Finisher","Diver,Tries To Beat Defensive Line,Outside Foo...",ST,86
4,56,29,51,35,48,Normal,70,15,14,11,...,Medium / Medium,31,1986/Mar/27,193,Manuel Neuer,https://cdn.sofifa.org/1x/18/players/167495.png,,"GK Long Throw,GK 1,on,1 rush,GK Rushes Out of ...",GK,92
5,57,38,60,43,42,Lean,64,17,21,15,...,Medium / Medium,26,1990/Nov/7,193,David De Gea Quintana,https://cdn.sofifa.org/1x/18/players/193080.png,,"GK Long Throw,GK Saves With Feet",GK,76
6,78,73,82,79,90,Normal,86,92,85,17,...,High / High,26,1991/Jun/28,181,Kevin De Bruyne,https://cdn.sofifa.org/1x/18/players/192985.png,"#Dribbler,#Playmaker ,#Engine,#Distance Shoot...","Leadership,Early Crosser,Long Passer,Long Shot...","CAM,CM",68
7,79,80,78,80,89,Normal,87,62,77,16,...,High / Medium,28,1988/Aug/21,185,Robert Lewandowski,https://cdn.sofifa.org/1x/18/players/188545.png,#Clinical Finisher,"Injury Free,Finesse Shot,Chip Shot",ST,79
8,93,54,93,93,93,Normal,89,81,83,36,...,High / Medium,26,1991/Jan/7,173,Eden Hazard,https://cdn.sofifa.org/1x/18/players/183277.png,"#Speedster,#Dribbler,#Acrobat","Tries To Beat Defensive Line,Finesse Shot,Flai...","LW,CF",76
9,65,60,71,71,89,Normal,85,85,85,18,...,Medium / Medium,27,1990/Jan/4,183,Toni Kroos,https://cdn.sofifa.org/1x/18/players/182521.png,"#Playmaker ,#Distance Shooter","Long Passer,Long Shot Taker,Playmaker,Technica...","CM,CDM",76


In [3]:
# One boolean per column: True when that column holds at least one missing value.
df.isnull().any(axis=0)

Acceleration                False
Aggression                  False
Agility                     False
Balance                     False
BallControl                 False
Body Type                   False
Composure                   False
Crossing                    False
Curve                       False
Dislike                     False
Dribbling                   False
FKAccuracy                  False
Finishing                   False
Follow                      False
GKDiving                    False
GKHandling                  False
GKKicking                   False
GKPositioning               False
GKReflexes                  False
HeadingAccuracy             False
Interceptions               False
International Reputation    False
Jumping                     False
Like                        False
LongPassing                 False
LongShots                   False
Marking                     False
Overall Rating              False
Penalties                   False
Position      

In [4]:
# List the distinct values of the two categorical columns before encoding them.
print(np.unique(df['Preferred Foot'].values))
print(np.unique(df['Body Type'].values))

['Left' 'Right']
['Akinfenwa' 'C. Ronaldo' 'Courtois' 'Lean' 'Messi' 'Neymar' 'Normal'
 'Shaqiri' 'Stocky']


### Data Cleaning

In [5]:
# Cast age to integer. Use the builtin `int`: the `np.int` alias (which was just
# the builtin) was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError.
df['age'] = df['age'].astype(int)

### Preferred Foot Conversion

In [6]:
# Encode the preferred foot as an integer flag: Left -> 0, Right -> 1.
foot_codes = {'Left': 0, 'Right': 1}
df['Preferred Foot'] = df['Preferred Foot'].map(foot_codes)

### Price Conversion 

In [7]:
# Collect the first character of every Value string and count how often each
# one occurs, to see which prefixes the raw strings start with.
currencies = np.array([value[0] for value in df.Value.tolist()])
print(np.unique(currencies, return_counts=True))

(array(['+', '-', '5', '6', '7', '8', '€'],
      dtype='<U1'), array([    7,    14,     3,    68,    53,     2, 17908], dtype=int64))


In [8]:
def value_to_num(col):
    """Convert a money string such as '€118.5M' or '€565K' to a plain number.

    Parameters
    ----------
    col : str, number or null
        A single cell value. Strings may carry a '€' sign and a trailing
        'M' (millions) or 'K' (thousands) unit.

    Returns
    -------
    int or float
        0 for null or blank input; otherwise the amount as a float, scaled
        by the trailing unit (1e6 for 'M', 1e3 for 'K', 1 otherwise).

    Raises
    ------
    ValueError
        If the text left after removing '€', 'M' and 'K' is not a number.
    """
    if pd.isnull(col):
        return 0

    # Already numeric (e.g. the conversion cell was run a second time):
    # pass the value through instead of failing on `.replace`.
    if not isinstance(col, str):
        return float(col)

    # Strip padding first so a trailing space cannot hide the unit suffix.
    text = col.strip()
    if not text:
        return 0

    multipliers = {'M': 1e6, 'K': 1e3}
    unit = multipliers.get(text[-1], 1)

    digits = text.replace('€', '').replace('M', '').replace('K', '')

    return float(digits) * unit

# Convert each money column from its string form to a numeric amount.
for money_column in ('Value', 'Wage', 'Release Clause'):
    df[money_column] = df[money_column].apply(value_to_num)

In [9]:
# Spot-check the converted money columns on the first five players.
df.head()[['name', 'Value', 'Wage', 'Release Clause']]

Unnamed: 0,name,Value,Wage,Release Clause
0,Lionel Messi,118500000.0,565000.0,242900000.0
1,C. Ronaldo dos Santos Aveiro,95500000.0,565000.0,195800000.0
2,Neymar da Silva Santos Jr.,119500000.0,280000.0,230000000.0
3,Luis Suárez,97000000.0,510000.0,198900000.0
4,Manuel Neuer,61000000.0,230000.0,100700000.0


# Training 

In [10]:
# Columns that carry no signal for the model.
unused_columns = ['name', 'photo_url', 'birth_date', 'Real Face', 'Body Type']
# Columns that may be useful later but are left out for now.
deferred_columns = ['positions', 'Work Rate', 'player_hashtags', 'player_traits']

# Keep every other column. Index.difference returns its result sorted where
# possible, so the remaining columns come out in sorted order.
df = df[df.columns.difference(unused_columns + deferred_columns)]
samples = df.values

print(samples.shape)

(18055, 50)


# Prediction 