In [1]:
# @Author: Apeksha
# Code to extract features from dataframes and generate multiclass labels for each of the records.

import numpy as np
import pandas as pd

# Path to the players file.
path="B:\MS_SCU\Winter20\DataMiningAndPatterRecognition\Project\Dataset\players_19.csv"

# Seperating label columns and training feature columns
attribute_columns=["sofifa_id","player_url","short_name","long_name","age","dob","height_cm","weight_kg","nationality","club","overall","potential","value_eur","wage_eur","preferred_foot","international_reputation","weak_foot","skill_moves","work_rate","body_type","real_face","release_clause_eur","player_tags","team_jersey_number","loaned_from","joined","contract_valid_until","nation_jersey_number","pace","shooting","passing","dribbling","defending","physic","gk_diving","gk_handling","gk_kicking","gk_reflexes","gk_speed","gk_positioning","player_traits ","attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing","attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing","skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility","movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina","power_strength","power_long_shots","mentality_aggression","mentality_interceptions","mentality_positioning","mentality_vision","mentality_penalties","mentality_composure","defending_marking","defending_standing_tackle","defending_sliding_tackle","goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning","goalkeeping_reflexes"]
label_columns=["sofifa_id","ls","st","rs","lw","lf","cf","rf","rw","lam","cam","ram","lm","lcm","cm","rcm","rm","lwb","ldm","cdm","rdm","rwb","lb","lcb","cb","rcb","rb"]

# Dataframe contains all the records 
players_records=pd.read_csv(path,sep=",",header="infer")

# Dataframe with feature columns
player_attributes_df=players_records.filter(items=attribute_columns)

#Dataframe with label columns
label_attributes_df=players_records.filter(items=label_columns)

In [2]:
# Getting ids of all the records whose label attributes are null.
null_sofifa_ids=label_attributes_df[label_attributes_df["ls"].isnull()]["sofifa_id"]

# Filter out all the records with null label attributes
label_attributes_df=label_attributes_df[label_attributes_df.ls.notnull()]

In [3]:
# This method takes data frames with all the label attributes and generates labels for player record.
def extract_labels(label_attributes_df):
    ids=label_attributes_df.sofifa_id
    label_attributes_df.drop(columns=["sofifa_id"],inplace=True)
    
    # Splitting each column values by "+" and storing the addition of splitted values.
    # Ex: label["ls"]= 93+1 =94
    for col in label_attributes_df.columns:
        label_attributes_df[col]=label_attributes_df[col].apply(lambda v:v.split('+')).apply(lambda s: int(s[0])+int(s[1]))
        
    # Finding all the position which has max values. 
    label_attributes_df=label_attributes_df[label_attributes_df.apply(lambda x: x == label_attributes_df.max(axis = 1))] 
    classes=[]
    
    # Getting the column names
    for i in range(label_attributes_df.shape[0]):
        l=list(label_attributes_df.columns[np.isfinite(label_attributes_df.iloc[i])])
        classes.append(l)
    
    labels=pd.DataFrame();
    labels["sofifa_id"]=ids
    labels["classes"]=classes
    #labels=label_attributes_df.filter(items=["sofifa_id","classes"])
    return labels

In [4]:
labels = extract_labels(label_attributes_df)

In [5]:
labels

Unnamed: 0,sofifa_id,classes
0,20801,"[ls, st, rs]"
1,158023,"[lf, cf, rf, lam, cam, ram]"
2,190871,"[lw, lf, cf, rf, rw, lam, cam, ram]"
4,192985,"[lam, cam, ram, lm, rm]"
5,155862,"[lcb, cb, rcb]"
...,...,...
17765,238985,"[ldm, cdm, rdm, lcb, cb, rcb]"
17766,240160,"[lw, rw]"
17767,241304,"[lm, rm, lwb, rwb, lb, rb]"
17768,240158,"[lw, rw]"


In [6]:
# Removes records from dataframe for which we dont have labels.
player_attributes_df=player_attributes_df[~player_attributes_df['sofifa_id'].isin(null_sofifa_ids)]

# Join the attributes and labels to form training dataframe.
train_df=player_attributes_df.join(labels.set_index('sofifa_id'),on="sofifa_id")

In [7]:
train_df.shape

(15784, 75)