# Importations

In [1]:
%matplotlib inline
from __future__ import division #Importer ça avant toute chose, autrement ne marche pas
import graphlab as gl
#import pandas as pd
gl.canvas.set_target('ipynb') # Pour afficher les graphes dans mon notebook plutôt que new tab
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pylab import pcolor, show, colorbar, xticks, yticks
import seaborn as sns
from sklearn import preprocessing

# Fonctions

In [34]:
#prediction_validation is the prediction done on the validation data using the validation set
#validation_data is the validation set
def calc_accuracy(prediction_validation, validation_data):
    accuracy = ((prediction_validation==validation_data["Survived"]).sum())/len(validation_data)
    return accuracy
#print("Accuracy of the model : %.3f" %accuracy)µ

In [2]:
# Calculate the mean age for a given class
def round_age(df, i_class):
    return round(df["Age"][df["Pclass"]==i_class].mean())

In [92]:
#Let's work with pandas then import in graphlab for use with models
df_data = pd.read_csv("./train.csv") #My training data
df_data_validation = pd.read_csv("./test.csv") #Data used for submission to Kaggle

# Data cleaning

In [93]:
df_data.loc[(df_data.Age.isnull()) & (df_data["Fare"] == 0), "Age"] = 0
df_data_validation.loc[(df_data_validation.Age.isnull()) & (df_data_validation["Fare"] == 0), "Age"] = 0

In [94]:
df_data_validation.loc[(df_data_validation.Fare.isnull()), "Fare"] = 9

In [95]:
mean_age = [round_age(df_data,i_class) for i_class in range(1,4,1)] #38, 29, 25
mean_age_valid = [round_age(df_data_validation,i_class) for i_class in range(1,4,1)] #41, 29, 24

In [96]:
for i in range(1,4,1):
    df_data.loc[(df_data.Age.isnull()) & (df_data["Pclass"] == i), "Age"] = mean_age[i-1]
    df_data_validation.loc[(df_data_validation.Age.isnull()) & (df_data_validation["Pclass"] == i), "Age"] = mean_age_valid[i-1]


In [97]:
x = df_data.values #Get only values from the dataframe
min_max_scaler = preprocessing.MinMaxScaler() #Create the scaling function
#x_scaled = min_max_scaler.fit_transform(x)
df_data["Age_n"] = min_max_scaler.fit_transform(df_data["Age"].reshape(-1,1))
df_data["Pclass_n"] = min_max_scaler.fit_transform(df_data["Pclass"].reshape(-1,1))
df_data["SibSp_n"] = min_max_scaler.fit_transform(df_data["SibSp"].reshape(-1,1))
df_data["Fare_n"] = min_max_scaler.fit_transform(df_data["Fare"].reshape(-1,1))
df_data.loc[df_data.Sex == "male", "Sex_n"] = 0
df_data.loc[df_data.Sex == "female", "Sex_n"] = 1



In [98]:
y = df_data_validation.values #Get only values from the dataframe
min_max_scaler = preprocessing.MinMaxScaler() #Create the scaling function

In [99]:
df_data_validation["Age_n"] = min_max_scaler.fit_transform(df_data_validation["Age"].reshape(-1,1))
df_data_validation["Pclass_n"] = min_max_scaler.fit_transform(df_data_validation["Pclass"].reshape(-1,1))
df_data_validation["SibSp_n"] = min_max_scaler.fit_transform(df_data_validation["SibSp"].reshape(-1,1))
df_data_validation["Fare_n"] = min_max_scaler.fit_transform(df_data_validation["Fare"].reshape(-1,1))
df_data_validation.loc[df_data_validation.Sex == "male", "Sex_n"] = 0
df_data_validation.loc[df_data_validation.Sex == "female", "Sex_n"] = 1



# Switch back to SFrames to use Graphlab

In [100]:
del df_data["Name"]
del df_data["Ticket"]
del df_data["Cabin"]
del df_data["Embarked"]
del df_data_validation["Name"]
del df_data_validation["Ticket"]
del df_data_validation["Cabin"]
del df_data_validation["Embarked"]

In [101]:
data = gl.SFrame(df_data)
data_for_validation = gl.SFrame(df_data_validation)

# Split data

In [26]:
(train, test) = data.random_split(.8)

# Build random forest classifier

In [102]:
target = "Survived"
features = ["Pclass_n", "Age_n", "SibSp_n", "Fare_n", "Sex_n"]

In [27]:
model = gl.random_forest_classifier.create(train, target = target, features=features, validation_set=None)

In [28]:
model.get_feature_importance()

name,index,count
Age_n,,98
Fare_n,,97
SibSp_n,,19
Pclass_n,,17
Sex_n,,9


In [37]:
model.get("training_accuracy")

0.8660351634025574

In [29]:
model.show(view="Tree")

In [32]:
predictions = model.predict(test)

In [33]:
predictions

dtype: int
Rows: 152
[0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, ... ]

# Evaluation of the model

In [38]:
model.evaluate(test)

{'accuracy': 0.8486842105263158,
 'auc': 0.8841886269070734,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |   6   |
 |      1       |        1        |   32  |
 |      1       |        0        |   17  |
 |      0       |        0        |   97  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.735632183908046,
 'log_loss': 0.40557728798670695,
 'precision': 0.8421052631578947,
 'recall': 0.6530612244897959,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+----+-----+
 | threshold | fpr | tpr | p  |  n  |
 +-----------+-----+-----+----+-----+
 |    0.0    | 1.0 | 1.0 | 49 | 103 |
 |   1e-05   | 1.0 | 1.0 | 49 | 103 |
 |   2e-05   | 1.0 | 1.0 | 49 | 10

In [35]:
calc_accuracy(predictions,test)

0.8486842105263158

## The accuracy on the validation set is 0.8486

## Precision and recall are : 0.8424 & 0.6530 

In [41]:
prediction_validation = model.predict(data_for_validation)

In [43]:
data_for_validation

PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Age_n,Pclass_n,SibSp_n,Fare_n
892,3,male,34.5,0,0,7.8292,0.453947368421,1.0,0.0,0.0152815806712
893,3,female,47.0,1,0,7.0,0.618421052632,1.0,0.125,0.0136630900601
894,2,male,62.0,0,0,9.6875,0.815789473684,0.5,0.0,0.0189087407081
895,3,male,27.0,0,0,8.6625,0.355263157895,1.0,0.0,0.0169080739493
896,3,female,22.0,1,1,12.2875,0.289473684211,1.0,0.125,0.0239836027304
897,3,male,14.0,0,0,9.225,0.184210526316,1.0,0.0,0.0180060008292
898,3,female,30.0,0,0,7.6292,0.394736842105,1.0,0.0,0.0148912066695
899,2,male,26.0,1,1,29.0,0.342105263158,0.5,0.125,0.0566042302488
900,3,female,18.0,0,0,7.2292,0.236842105263,1.0,0.0,0.014110458666
901,3,male,21.0,2,0,24.15,0.276315789474,1.0,0.25,0.0471376607072

Sex_n
0.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0


In [None]:
result = gl.SFrame({'PassengerId':data_for_validation['PassengerId'],"Survived":prediction_validation})
result.export_csv("gender_submission_randomforest1.csv")

# Does imbalance affect the predictions ?

In [103]:
nb_dead, nb_alive = len(data[data["Survived"]==0]),len(data[data["Survived"]==1])
print "Balance of dead/alive : %.2f %% dead and %.2f %% alive" %((549/(549+342)*100),(342/(549+342)*100))

Balance of dead/alive : 61.62 % dead and 38.38 % alive


## Let's balance the data

In [104]:
dead_raw = data[data["Survived"]==0]
alive_raw = data[data["Survived"]==1]
ratio = len(alive_raw)/len(dead_raw)
print ratio
dead = dead_raw.sample(ratio)  # I'll sample my data by deleting random rows of people who did not survive
print "Initial length of dead", len(dead_raw)
print "New length of dead", len(dead)
alive = alive_raw
data_balanced = alive.append(dead)

0.622950819672
Initial length of dead 549
New length of dead 330


Now I have a data set containing the same number of dead or survived

In [105]:
len(data_balanced["Survived"]==0), len(data_balanced["Survived"]==1)

(672, 672)

# Let's train the model on the balanced data

In [106]:
train_balanced, test_balanced = data_balanced.random_split(0.8)

In [107]:
model_balanced = gl.random_forest_classifier.create(train_balanced, target = target, features=features, validation_set=None)

The training accuracy has decreased, maybe we have too few samples now ?

In [108]:
model_balanced.get("training_accuracy")

0.8775510191917419

The validation accuracy has decreased. Recall has increased.
We should investigate via cross validation

In [109]:
model_balanced.evaluate(test_balanced)

{'accuracy': 0.8120300751879699,
 'auc': 0.8905380333951761,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |   19  |
 |      1       |        1        |   58  |
 |      0       |        0        |   50  |
 |      0       |        1        |   6   |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.822695035460993,
 'log_loss': 0.4652423571929549,
 'precision': 0.90625,
 'recall': 0.7532467532467533,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+----+----+
 | threshold | fpr | tpr | p  | n  |
 +-----------+-----+-----+----+----+
 |    0.0    | 1.0 | 1.0 | 77 | 56 |
 |   1e-05   | 1.0 | 1.0 | 77 | 56 |
 |   2e-05   | 1.0 | 1.0 | 77 | 56 |
 |   3e-05   |

In [110]:
predictions_balanced_validation = model_balanced.predict(data_for_validation)

# Let's export to kaggle

In [111]:
result = gl.SFrame({'PassengerId':data_for_validation['PassengerId'],"Survived":predictions_balanced_validation})
result.export_csv("gender_submission_randomforest3.csv")

# The score is now 0.76077

Although the data has been properly balanced, the score has strongly decreased. This is probably due to a too short data set, or the deleted rows might have been more meaningful entries

Note that if a new random sampling is done, the precision and accuracy of the model changes. This confirms that imbalacing such a small dataset is risky