## **Importing libraries and some config**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')

## **Acquiring data**

In [2]:
# Load the training and test CSVs; the paths are relative to the working directory.
train_df=pd.read_csv('files_nuevoReto/olympics_train.csv')
test_df=pd.read_csv('files_nuevoReto/olympics_test.csv')
# Both frames gathered in one list (not referenced again in the cells visible here).
combine=[train_df,test_df]

### **Wrangle data**

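**Eliminaremos los duplicados de los dos datasets** (paso desactivado por ahora: el código de la celda siguiente está comentado, por lo que las dimensiones no cambian)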

In [3]:
train_df.shape,test_df.shape

((216892, 15), (54090, 14))

In [4]:
#train_df=train_df.drop_duplicates().copy()
#test_df=test_df.drop_duplicates().copy()
#combine=[train_df,test_df]

In [5]:
train_df.shape,test_df.shape

((216892, 15), (54090, 14))

##### **Eliminar las columnas que no nos interesan**

In [6]:
# Separate the 'Medal' target from the training features and work on a copy of the test frame.
y = train_df['Medal']
X = train_df.drop(columns=['Medal'])
X_test = test_df.copy()

In [7]:
# Columns excluded from the feature set; every other column of X is kept, in its original order.
drop_cols=['ID','Name','Season','Sex'] # Season and Sex were dropped based on the feature-importance plot
keep_cols=[col for col in X.columns if col not in drop_cols]

In [8]:
# Restrict both feature frames to the kept columns; .copy() yields independent frames for the in-place steps below.
X=X[keep_cols].copy()
X_test=X_test[keep_cols].copy()

##### **Completando los valores vacios**

In [9]:
# Split the kept columns into numeric ones and the remaining (categorical) ones.
cont_cols = list(filter(lambda name: pd.api.types.is_numeric_dtype(X[name]), keep_cols))
cat_cols = list(filter(lambda name: name not in cont_cols, keep_cols))

In [10]:
#Podemos rellenar los valores vacíos de Age con la relacion que tiene con Sexo y Sport posiblemente
#Pero ahora solo rellenaremos con la media
#means={columna:{mean:value,is_null:value}}
def refill_colnum(data,columns,means=None):
    """Fill missing values of numeric columns with a per-column mean, in place.

    When ``means`` is None the function runs in "fit" mode: it computes the
    mean of every column in ``columns`` from ``data`` and, for columns with
    more than 15 missing values, records an ``'is_null'`` flag. When ``means``
    is supplied, those stored statistics are reused unchanged.

    For every column whose entry carries the ``'is_null'`` flag, a boolean
    ``<col>Missing`` indicator column is added to ``data`` before filling.

    Parameters:
        data: DataFrame to modify in place.
        columns: iterable of column names to process.
        means: optional dict ``{column: {'mean': value, 'is_null': True}}``
            produced by a previous fit-mode call.

    Returns:
        The statistics dict (newly built in fit mode, otherwise the same
        object that was passed in).
    """
    fitting = means is None
    stats = {} if fitting else means
    for name in columns:
        if fitting:
            stats[name] = {'mean': data[name].mean()}
            # Only columns with a meaningful amount of gaps get an indicator.
            if data[name].isna().sum() > 15:
                stats[name]['is_null'] = True
        if 'is_null' in stats[name]:
            # Record which rows were missing before the gaps are filled.
            data[f'{name}Missing'] = data[name].isnull()
        data[name] = data[name].fillna(stats[name]['mean'])
    return stats

In [11]:
# Compute the fill statistics on the training features, then reuse them on the test features.
means=refill_colnum(X,cont_cols)
refill_colnum(X_test,cont_cols,means);

**Conversion de la variable Year a categorico**

- Sin convertirlo a categórico, resultados: decision tree=0.8949287713553361, KNN=0.9139457951479895, LIGHT=0.9170611314333948
- Convirtiendo a categórico, resultados: decision tree=0.8956477641307586, KNN=0.9144804644384558, LIGHT=0.9170611314333948
- Sin convertirlo a categórico y estratificando, resultados: decision tree=0.8949152622490429, KNN=0.9145692452447041, LIGHT=0.9170611314333948

In [12]:
def conversionYear(dataset):
    """Cast the ``Year`` column of each frame in ``dataset`` to strings, in place.

    Parameters:
        dataset: iterable of DataFrames that each contain a ``Year`` column.

    Returns:
        None; every frame is modified directly.
    """
    for frame in dataset:
        year_as_text = frame['Year'].astype(str)
        frame['Year'] = year_as_text

In [13]:
# Turn Year into strings in both feature frames (modified in place).
conversionYear([X,X_test])

In [14]:
# Year is now a string column, so treat it as categorical.
# The membership guard keeps this cell idempotent: re-running it must not add
# 'Year' a second time, otherwise the column would be label-encoded twice.
if 'Year' not in cat_cols:
    cat_cols.append('Year')

In [15]:
cat_cols

['Team', 'NOC', 'Games', 'City', 'Sport', 'Event', 'Year']

##### **Conversión de columnas a valores discretos**

In [16]:
# Temporarily join X and y to check whether discretizing the ages correlates with the Medal variable
X_junto=X.copy()
X_junto['Medal']=y

In [17]:
col_to_stra=['Age','Height','Weight']#,'Year']
dataset=[X,X_test]
def stratify(col_to_stra,num,dataset):
    """Replace continuous columns with the ordinal index of an equal-width band.

    For each column, ``num`` equal-width intervals are derived with ``pd.cut``
    from the module-level ``X_junto`` frame (a ``<col>Band`` column is added
    to it as a side effect). Every frame in ``dataset`` then has the values
    that fall inside band ``i`` (``left < value <= right``) overwritten with
    ``i``. Values outside all bands are left unchanged.

    Parameters:
        col_to_stra: column names to discretize.
        num: number of intervals passed to ``pd.cut``.
        dataset: list of DataFrames to modify in place.

    Relies on the module-level ``X_junto`` and ``train_df`` frames.
    """
    # Rows that actually won a medal; used only for the exploratory share below.
    medal_rows = train_df.Medal != 'None'
    medal_total = len(train_df[medal_rows])
    for col in col_to_stra:
        band_col = f'{col}Band'
        # Originally used to inspect how the bands relate to Medal. The table is not
        # returned, so it cannot be viewed from outside the function.
        X_junto[band_col] = pd.cut(X_junto[col], num)  # creates "num" intervals
        gruBand = (X_junto[[band_col,'Medal']][medal_rows]
                   .groupby(band_col,as_index=False).count()
                   .sort_values(by='Medal',ascending=False))
        gruBand['Medal'] = gruBand['Medal'].apply(lambda x: (x/medal_total)*100)

        # Assign the band number to every value inside that band.
        # E.g.: lower bound < age <= upper bound
        for ds in dataset:
            # Compare against a snapshot of the raw values: the column is being
            # overwritten with band numbers, and an already-assigned number must
            # not be matched (and re-binned) by a later interval.
            raw = ds[col].copy()
            for i in range(len(gruBand)):
                # Label-based lookup: row label i survives the sort above, so bands
                # are numbered in the order produced by the groupby, not by share.
                band = gruBand[band_col][i]
                ds.loc[(raw > band.left) & (raw <= band.right), col] = i
stratify(col_to_stra,2,dataset)

In [18]:
X.head()

Unnamed: 0,Age,Height,Weight,Team,NOC,Games,Year,City,Sport,Event,AgeMissing,HeightMissing,WeightMissing
0,0.0,1.0,0.0,France,FRA,2016 Summer,2016,Rio de Janeiro,Rugby Sevens,Rugby Sevens Men's Rugby Sevens,False,False,False
1,0.0,0.0,0.0,Finland,FIN,1952 Summer,1952,Helsinki,Weightlifting,Weightlifting Men's Bantamweight,False,False,False
2,0.0,0.0,0.0,United States,USA,1932 Summer,1932,Los Angeles,Art Competitions,"Art Competitions Mixed Sculpturing, Unknown Event",False,True,True
3,0.0,0.0,0.0,Kazakhstan,KAZ,2006 Winter,2006,Torino,Cross Country Skiing,Cross Country Skiing Women's 30 kilometres,False,False,False
4,0.0,0.0,0.0,East Germany,GDR,1972 Summer,1972,Munich,Swimming,Swimming Women's 200 metres Backstroke,False,False,False


##### **Transformacion de las variables categoricas**

In [19]:
X[cat_cols].nunique()

Team     1147
NOC       230
Games      51
City       42
Sport      66
Event     765
Year       35
dtype: int64

NOC tiene mucha cardinalidad, es posible que no debamos tenerlo en cuenta, revisar esto

In [20]:
from sklearn.preprocessing import LabelEncoder

#encoder={cat_col:labelEncoder}

def preprocess_cats(X, cat_cols, lencoders=None):
    """Label-encode the categorical columns of ``X`` in place.

    When ``lencoders`` is None the function runs in "fit" mode: one
    ``LabelEncoder`` is fitted per column and an extra ``'unseen'`` class is
    appended to each encoder. When ``lencoders`` is supplied, values that the
    stored encoder does not know are replaced by ``'unseen'`` before encoding.

    Parameters:
        X: DataFrame to modify in place.
        cat_cols: iterable of column names to encode.
        lencoders: optional dict ``{column: fitted encoder}`` from a previous
            fit-mode call.

    Returns:
        The encoder dict (newly built in fit mode, otherwise the same object
        that was passed in).
    """
    train = lencoders is None
    # Initialize label encoders
    if train:
        lencoders = {}
    for c in cat_cols:
        # Parse to string before encoding
        X[c] = X[c].astype(str)
        if train:
            lencoders[c] = LabelEncoder().fit(X[c])

            # Add an 'unseen' label to the encoder for values that only appear in X_test
            lencoders[c].classes_ = np.append(lencoders[c].classes_, 'unseen')
        else:
            # Replace unknown values with 'unseen' so the encoder can transform them.
            # One vectorized membership test instead of scanning the class array per row.
            known = X[c].isin(lencoders[c].classes_)
            X.loc[~known, c] = 'unseen'
        X[c] = lencoders[c].transform(X[c])
    return lencoders

In [21]:
# Fit the label encoders on the training features, then apply the same encoders to the test features.
lencoders=preprocess_cats(X,cat_cols)
preprocess_cats(X_test,cat_cols,lencoders);

## **Machine Learning**

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training data for validation; the fixed random_state makes the split repeatable.
X_train,X_validation,y_train,y_validation=train_test_split(X,y,test_size=0.1,random_state=2019)

In [23]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree. The seed matches the one used for the train/validation
# split so the fitted tree (and the scores below) are reproducible across runs.
dt=DecisionTreeClassifier(random_state=2019).fit(X_train,y_train)

In [24]:
dt.score(X_train,y_train),dt.score(X_validation,y_validation)

(0.9778024815319515, 0.8937759336099586)

In [25]:
from sklearn.metrics import f1_score
# Weighted F1 of the decision tree on the training and validation splits.
y_train_pred = dt.predict(X_train)
y_validation_pred = dt.predict(X_validation)
f1_score(y_train, y_train_pred, average='weighted'),f1_score(y_validation, y_validation_pred, average='weighted')

#0.8922995738414508 Todo estratificado a 5
#0.891264019192194 Todo estratificado a 6

(0.9776638899206604, 0.8947955166969662)

#### **KNN**

In [26]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier: 20 neighbours, votes weighted by distance.
knn = KNeighborsClassifier(n_neighbors =20,weights='distance',algorithm='auto').fit(X_train, y_train) #n_neigh=20

In [27]:
knn.score(X_train,y_train),knn.score(X_validation,y_validation)
#0.9189

(0.9778024815319515, 0.918118948824343)

In [28]:
# Weighted F1 of the KNN model on the training and validation splits.
# The training predictions must come from KNN itself: otherwise y_train_pred
# still holds the decision tree's predictions from the earlier cell, and the
# first number reported here would be the tree's F1 rather than KNN's.
y_train_pred = knn.predict(X_train)
y_validation_pred = knn.predict(X_validation)
f1_score(y_train, y_train_pred, average='weighted'),f1_score(y_validation, y_validation_pred, average='weighted')
#0.9144

(0.9776638899206604, 0.9136540855202635)

**Probando mas modelos para mejorar el resultado**

### **LIGHTGBM**

In [59]:
import lightgbm as lgb

In [60]:
# Gradient-boosted multiclass model trained on the same split as the other models.
# NOTE(review): num_iterations=1800 is passed while n_estimators keeps its default
# (the repr below shows n_estimators=100) — presumably num_iterations takes
# precedence; confirm against the LightGBM documentation.
clf = lgb.LGBMClassifier(task = 'train',objective = 'multiclass',boosting_type = 'gbdt',
                          metric = 'multi_logloss',learning_rate=0.4,max_bin=1000,num_iterations=1800) #iterations:1500
clf.fit(X_train,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.4, max_bin=1000,
               max_depth=-1, metric='multi_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_iterations=1800, num_leaves=31,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, task='train')

**Optimización bayesiana**

In [61]:
clf.score(X_train,y_train),clf.score(X_validation,y_validation)
#0.9231

(0.9667575127304023, 0.9226371599815584)

In [62]:
# Weighted F1 of the LightGBM model on the validation split only (no training-set F1 is computed here).
y_validation_pred = clf.predict(X_validation)
f1_score(y_validation, y_validation_pred, average='weighted')
#0.9170

0.9166539208971859

**Creacion del output**

In [53]:
# Predict the test set with the KNN model.
y_test_pred=knn.predict(X_test)

In [54]:
y_test_pred.shape

(54090,)

In [55]:
# Load the sample submission CSV as the base frame for the output file.
submission=pd.read_csv('files_nuevoReto/medals_sammple_submission.csv')

In [56]:
# Remove the sample's 'Rating' column and preview what is left.
submission=submission.drop('Rating',axis=1)
submission.head()

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4


In [57]:
# Attach the KNN predictions as the 'Medal' column and write the CSV without the index.
submission['Medal']=list(y_test_pred)
submission.to_csv('nuevo_v1_olympics_submission.csv', index=False)

In [58]:
submission.head()

Unnamed: 0,Id,Medal
0,0,
1,1,
2,2,
3,3,
4,4,Gold


**Creando output con light**

In [63]:
# Predict the test set with the LightGBM model (overwrites the KNN predictions held in y_test_pred).
y_test_pred=clf.predict(X_test)

In [64]:
# Reload the sample submission CSV, remove its 'Rating' column and preview what is left.
submission=pd.read_csv('files_nuevoReto/medals_sammple_submission.csv')
submission=submission.drop('Rating',axis=1)
submission.head()

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4


In [65]:
# Attach the LightGBM predictions as the 'Medal' column and write the CSV without the index.
submission['Medal']=list(y_test_pred)
submission.to_csv('nuevo_v2_light_olympics_submission.csv', index=False)

In [66]:
submission.head()

Unnamed: 0,Id,Medal
0,0,
1,1,
2,2,
3,3,
4,4,Gold
