In [1]:
import numpy as np
import pandas as pd

## Load data
https://www.kaggle.com/alopez247/pokemon

In [2]:
df = pd.read_csv('pokemon_alopez247.csv')

## Description of data

This database includes 21 variables per each of the 721 Pokémon of the first six generations, plus the Pokémon ID and its name. These variables are briefly described next:

 - **Number.** Pokémon ID in the Pokédex.
 - **Name.** Name of the Pokémon.
 - **Type_1.** Primary type.
 - **Type_2.** Second type, in case the Pokémon has it.
 - **Total.** Sum of all the base stats (Health Points, Attack, Defense, Special Attack, Special Defense, and Speed). 
 - **HP.** Base Health Points.
 - **Attack.** Base Attack.  
 - **Defense.** Base Defense.
 - **Sp_Atk.** Base Special Attack.
 - **Sp_Def.** Base Special Defense.
 - **Speed.** Base Speed.
 - **Generation.** Number of the generation when the Pokémon was introduced.
 - **isLegendary.** Boolean that indicates whether the Pokémon is Legendary or not.
 - **Color.** Color of the Pokémon according to the Pokédex.
 - **hasGender.** Boolean that indicates if the Pokémon can be classified as female or male.
 - **Pr_male.** In case the Pokémon has Gender, the probability of its being male. The probability of being female is, of course, 1 minus this value.
 - **Egg_Group_1.** Egg Group of the Pokémon.
 - **Egg_Group_2.** Second Egg Group of the Pokémon, in case it has two.
 - **hasMegaEvolution.** Boolean that indicates whether the Pokémon is able to Mega-evolve or not.
 - **Height_m.** Height of the Pokémon, in meters.
 - **Weight_kg.** Weight of the Pokémon, in kilograms.
 - **Catch_Rate.** Catch Rate.
 - **Body_Style.** Body Style of the Pokémon according to the Pokédex.

In [3]:
df.head()

Unnamed: 0,Number,Name,Type_1,Type_2,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,...,Color,hasGender,Pr_Male,Egg_Group_1,Egg_Group_2,hasMegaEvolution,Height_m,Weight_kg,Catch_Rate,Body_Style
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,...,Green,True,0.875,Monster,Grass,False,0.71,6.9,45,quadruped
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,...,Green,True,0.875,Monster,Grass,False,0.99,13.0,45,quadruped
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,...,Green,True,0.875,Monster,Grass,True,2.01,100.0,45,quadruped
3,4,Charmander,Fire,,309,39,52,43,60,50,...,Red,True,0.875,Monster,Dragon,False,0.61,8.5,45,bipedal_tailed
4,5,Charmeleon,Fire,,405,58,64,58,80,65,...,Red,True,0.875,Monster,Dragon,False,1.09,19.0,45,bipedal_tailed


In [4]:
for col in df.columns:
    if df[col].dtype == int:
        df[col] = df[col].astype(float)

In [5]:
df['isLegendary'].value_counts()

False    675
True      46
Name: isLegendary, dtype: int64

## Split data

In [6]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

## Convert to array

In [7]:
def get_arrays(df):
    X = np.array(df[['HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def']])
    y = np.array(df['isLegendary'])
    
    return X, y

X_train, y_train = get_arrays(df_train)
X_test, y_test = get_arrays(df_test)

X_train.shape, y_train.shape

((576, 5), (576,))

## Simple pipeline

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

## Train and predict

In [9]:
model = pipeline.fit(X_train, y_train)
model.predict(X_train)[:5]

array([False, False, False, False, False], dtype=bool)

## Validate classifier

In [10]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train, model.predict(X_train))

0.71810800234787719

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def cross_validate_auc(pipeline, X_train, y_train):
    results = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring=make_scorer(roc_auc_score),
        cv=10,
    )

    return np.mean(results)
    
cross_validate_auc(pipeline, X_train, y_train)

0.71526030747728864

In [12]:
# Task 1
# Extend cross_validate to take any scoring function as argument and calculate accuracy.

# Your code here:
def cross_validate(pipeline, X_train, y_train, score_func):
    pass

## Custom transformers

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

class PandasSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, selected_columns):
        self.selected_columns = selected_columns
    
    def fit(self, df, *args):
        return self

    def transform(self, df):
        return np.array(df[self.selected_columns])

## Complex pipeline

In [14]:
from sklearn.pipeline import make_union

pipeline_stats = make_pipeline(
    PandasSelector(['HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def']),
    StandardScaler(),
)

pipeline_hasGender = make_pipeline(
    PandasSelector(['hasGender']),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

0.85596610761705105

### Categorical variables

In [15]:
pipeline_color = make_pipeline(
    PandasSelector(['Color']),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
        pipeline_color,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

# it doesn't work since LogisticRegression won't work on string variable

ValueError: could not convert string to float: 'Yellow'

In [16]:
from sklearn.preprocessing import OneHotEncoder

pipeline_color = make_pipeline(
    PandasSelector(['Color']),
    OneHotEncoder(),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
        pipeline_color,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

# it doesn't work since OneHotEncoder expects int

ValueError: could not convert string to float: 'Yellow'

In [19]:
class StringConverter(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.map = {} # column : string : int
    
    def fit(self, X, *args):
        for col in range(X.shape[1]):
            self.map[col] = {}
            idx = 1
            for row in range(X.shape[0]):
                s = X[row, col]
                if s not in self.map[col]:
                    self.map[col][s] = idx
                    idx += 1
        return self

    def transform(self, X):
        X_int = np.zeros(shape=X.shape)
        for col in range(X.shape[1]):
            for row in range(X.shape[0]):
                s = X[row, col]
                X_int[row, col] = self.map[col].get(s, 0)

        return X_int

In [20]:
pipeline_color = make_pipeline(
    PandasSelector(['Color']),
    StringConverter(),
    OneHotEncoder(),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
        pipeline_color,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

0.88420684835779184

### Missing values

In [21]:
np.mean(df_train['Pr_Male'].isnull())

0.1076388888888889

In [22]:
from sklearn.preprocessing import Imputer

pipeline_PrMale = make_pipeline(
    PandasSelector(['Pr_Male']),
    Imputer(),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
        pipeline_color,
        pipeline_PrMale,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

0.87263277428371766

In [23]:
# Task 2
# Write your own Imputer transformer

# Your code here:
class CustomImputer():
    pass

### Text data

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline_name = make_pipeline(
    PandasSelector('Name'),
    TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 5),
        min_df=10,
    ),
)

pipeline = make_pipeline(
    make_union(
        pipeline_stats,
        pipeline_hasGender,
        pipeline_color,
        pipeline_PrMale,
        pipeline_name,
    ),
    LogisticRegression(),
)

cross_validate_auc(pipeline, df_train, y_train)

0.88096610761705096

## Fine-tuning

In [25]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__class_weight': [None, 'balanced'],
}

grid = GridSearchCV(
    pipeline,
    parameters,
    scoring=make_scorer(roc_auc_score),
).fit(df_train, y_train)

print('Best params: {}'.format(grid.best_params_))
print('Best AUC: {:.3f}'.format(grid.best_score_))

final_model = grid.best_estimator_

Best params: {'logisticregression__C': 0.1, 'logisticregression__class_weight': 'balanced'}
Best AUC: 0.972


## Final evaluation

In [26]:
roc_auc_score(y_test, final_model.predict(df_test))

0.97445255474452552

## Save model

In [27]:
from sklearn.externals import joblib

joblib.dump(final_model, 'final_model.pkl');

In [28]:
loaded_model = joblib.load('final_model.pkl')
roc_auc_score(y_test, final_model.predict(df_test))

0.97445255474452552

In [29]:
# Task 3
# Experiment with sklearn transforms and models and build your own pipeline.
# http://scikit-learn.org/stable/modules/classes.html

# Your code here:
