In [396]:
# reference: 

# https://www.analyticsvidhya.com/blog/2017/07/introduction-to-genetic-algorithm/
# http://rhiever.github.io/tpot/
# https://www.youtube.com/watch?v=BEquIwfEXes&list=PLXO45tsB95cJyeE6BgkApUbAREpkoPDvG&index=1

## 1. Intuition behind Genetic Algorithms

In [397]:
# It is not the strongest of the species that survives, nor the most intelligent , 
# but the one most responsive to change

## 2 Steps Involved in Genetic Algorithm

In [398]:
# 1,Initialisation (population)
# 2,Fitness Function    # a measurement of degree to which it adapts to the environment 
# 3,Selection
# 4,Crossover
# 5,Mutation
# 6,good offsprings (good fitness) replace some from original population

## 3 Implementation using TPOT library  (Tree-based Pipeline Optimisation Technique) 

In [399]:
# TPOT tutorial on the Titanic dataset
# TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.

In [115]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import collections

In [8]:
titanic = pd.read_csv("Logistic-Regression/titanic_train.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### EDA

In [17]:
titanic.groupby("Sex")["Survived"].value_counts()

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64

In [18]:
titanic.groupby(["Pclass", "Sex"])["Survived"].value_counts()

Pclass  Sex     Survived
1       female  1            91
                0             3
        male    0            77
                1            45
2       female  1            70
                0             6
        male    0            91
                1            17
3       female  0            72
                1            72
        male    0           300
                1            47
Name: Survived, dtype: int64

In [28]:
i = pd.crosstab([titanic.Pclass, titanic.Sex], titanic.Survived.astype(float))

In [65]:
i.div(i.sum(1).astype(float), axis =0)  

# Divide each row of a DataFrame by another DataFrame vector
# 0 represents row, 1 represents column

Unnamed: 0_level_0,Survived,0.0,1.0
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,0.031915,0.968085
1,male,0.631148,0.368852
2,female,0.078947,0.921053
2,male,0.842593,0.157407
3,female,0.5,0.5
3,male,0.864553,0.135447


### Data Munging

In [83]:
# The first and most important step in using TPOT on any data set is to 
# rename the target class/response variable to class.

In [67]:
titanic.rename(columns={"Survived":"class"}, inplace=True)

In [84]:
# At present, TPOT requires all the data to be in numerical format.
# For nan i.e. the missing values, we simply replace them with a placeholder value (-999).

In [79]:
titanic["Sex"] = titanic["Sex"].map({"male":0, "female":1})
titanic["Embarked"] = titanic["Embarked"].map({"S":0, "C":1, "Q":2})

In [82]:
titanic = titanic.fillna(-999)
pd.isnull(titanic).any()

PassengerId    False
class          False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [86]:
# Since Name and Ticket have so many levels, we drop them from our analysis for the sake of simplicity. 
# For Cabin, we encode the levels as digits using Scikit-learn's MultiLabelBinarizer 
# and treat them as new features.

In [176]:
# very useful technique! something kind of like get_dummies

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Cabin_trans = mlb.fit_transform( [ {str(val)} for val in titanic['Cabin'].values] )    # {str(val)} instead of str(val)

In [180]:
titanic_new = titanic.drop(['Name','Ticket','Cabin','class'], axis=1)

In [190]:
assert (len(titanic['Cabin'].unique()) == len(mlb.classes_)), "Not Equal" #check correct encoding done

In [196]:
titanic_new = np.hstack((titanic_new, Cabin_trans))

In [201]:
titanic_new.shape

(891, 156)

In [202]:
titanic_class = titanic["class"].values

### Data Analysis using TPOT

In [226]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(
    titanic.index, stratify = titanic_class, train_size=0.75, test_size=0.25)

# This stratify parameter makes a split so that the proportion of values in the sample produced
# will be the same as the proportion of values provided to parameter stratify.




In [227]:
training_indices.size, validation_indices.size

(668, 223)

In [247]:
tpot = TPOTClassifier(generations=15 ,verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=40)

tpot.fit(titanic_new[training_indices], titanic_class[training_indices])
# cool ! I never thought I could select rows this way



Optimization Progress: 81pipeline [00:31,  2.05pipeline/s]                  

Generation 1 - Current best internal CV score: 0.8293638740300169


Optimization Progress: 124pipeline [00:56,  2.20pipeline/s]                   

Generation 2 - Current best internal CV score: 0.8293638740300169


Optimization Progress: 166pipeline [01:26,  1.65pipeline/s]                   

Generation 3 - Current best internal CV score: 0.8352785778708794


Optimization Progress: 210pipeline [01:59,  1.20s/pipeline]                   

Generation 4 - Current best internal CV score: 0.8352785778708794


                                                                              


2.0806005 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(MaxAbsScaler(input_matrix), bootstrap=False, criterion=entropy, max_features=0.2, min_samples_leaf=1, min_samples_split=11, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=1000000, max_eval_time_mins=0.04,
        max_time_mins=2, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=40, periodic_checkpoint_folder=None,
        population_size=40, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [248]:
tpot.score(titanic_new[validation_indices], titanic.loc[validation_indices, "class"].values)

0.79372197309417036

In [249]:
tpot.export('tpot_titanic_pipeline.py')   # export Best pipeline

True

In [None]:
# %load tpot_titanic_pipeline.py    # wow magic way of loading .py file
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8352785778708794
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.2, min_samples_leaf=1, min_samples_split=11, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


### Make predictions on the submission data

In [376]:
titanic_sub = pd.read_csv("Logistic-Regression/titanic_test.csv")

In [377]:
# The most important step here is to check for new levels in the categorical variables of the submission dataset
# that are absent in the training set. We identify them and set them to our placeholder value of '-999', 
# i.e., we treat them as missing values. This ensures training consistency, 
# as otherwise the model does not know what to do with the new levels in the submission dataset.

In [378]:
for var in ['Cabin']: 
    new = list(set(titanic_sub[var]) - set(titanic[var]))     # good to know that set is very useful
    titanic_sub.loc[titanic_sub[var].isin(new), var] = -999

In [379]:
titanic_sub['Sex'] = titanic_sub['Sex'].map({'male':0,'female':1})
titanic_sub['Embarked'] = titanic_sub['Embarked'].map({'S':0,'C':1,'Q':2})

In [380]:
titanic_sub = titanic_sub.fillna(-999)
pd.isnull(titanic_sub).any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [381]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
SubCabinTrans = mlb.fit([{str(val)} for val in titanic['Cabin'].values]).transform([{str(val)} for val in titanic_sub['Cabin'].values])

titanic_sub = titanic_sub.drop(['Name','Ticket','Cabin'], axis=1)

In [385]:
titanic_sub_new = np.hstack((titanic_sub.values, SubCabinTrans))

In [388]:
assert (titanic_new.shape[1] == titanic_sub_new.shape[1]), "Not Equal"

In [391]:
# generate the predictions 
submission = tpot.predict(titanic_sub_new)

In [394]:
# output submission file 

# final = pd.DataFrame({"PassengerId":titanic_sub["PassengerId"], "Survived":submission})
# final.to_csv("titanic_submission.csv", index=False)