In [4]:
# Installing packages 
# uncomment based on requirement

# !pip install scikit-learn
# !pip install matplotlib
# !pip install numpy
# !pip install pandas
# !pip install seaborn
# !pip install scikit-optimize

In [5]:
#importing packages
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,precision_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib

In [6]:
#importing datasets
df_train = pd.read_csv(r"C:\Users\karimia\Downloads\train.csv")
df_test = pd.read_csv(r"C:\Users\karimia\Downloads\test.csv")

psgId = df_test["PassengerId"] #extracting passenger IDs
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
rs = np.random.RandomState(0)
df1 = df_train.loc[:, ['Survived', 'Pclass', 'Age', 'SibSp','Parch','Fare']]
corr = df1.corr()
corr.style.background_gradient()
#By looking at correlation matrix, we can get better overview on the features

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [9]:
# The dataset cleaning for NaN, which is filled by "U" for embarked and Cabin Number and median for 
# "SibSp","Parch","Fare", and "Age" features
def clean(data):
    data = data.drop(["Name","Ticket","PassengerId"], axis = 1)
    
    cols = ["SibSp","Parch","Fare","Age"]
    for col in cols:
        data[col].fillna(data[col].median(), inplace = True)
        
    data.Embarked.fillna("U", inplace = True)
    data.Cabin.fillna("U", inplace = True)
    return data 

df_train = clean(df_train)
df_test = clean(df_test)

In [10]:
#Labeling cabin based on the first letter
cabin1 = df_train.loc[:,'Cabin']
cabin2 = df_test.loc[:,'Cabin']

cabins1 = []
cabins2 = []
for i in range(len(cabin1)):
    cabins1 += cabin1[i][0]

for i in range(len(cabin2)):
    cabins2 += cabin2[i][0]
    
df_train['Cabin'] = cabins1
df_test['Cabin'] = cabins2

In [11]:
# Labeling columns "Sex","Embarked","Cabin"
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
cols = ["Sex","Embarked","Cabin"]

for col in cols:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    print(le.classes_)

['female' 'male']
['C' 'Q' 'S']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'U']


In [12]:
# Splitting train and test datasets
y = df_train["Survived"]
x = df_train.drop("Survived",axis = 1)
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size = 0.2, random_state = 42)

## Random Foerest

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
RF_predictions_sub = rf.predict(df_test)

In [14]:
print('Accuracy Score', ' : ', accuracy_score(y_val,y_pred))
print('precision_score', ' : ', precision_score(y_val,y_pred))
print('F1', ' : ', f1_score(y_val,y_pred))

Accuracy Score  :  0.7877094972067039
precision_score  :  0.75
F1  :  0.7397260273972602


# Random Foerest with Randomized Search hyperparameter tuning

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

import warnings 

In [16]:
#Define hyperparameters for Random Forest
rf_params = {
    'n_estimators': [100, 200, 300], 
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']}

In [17]:
rf_random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=rf_params, n_iter=10, cv=5, random_state=42)
rf_random_search.fit(x_train, y_train)

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\karimia\AppData\Local\anaconda3\envs\Gurobi\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\karimia\AppData\Local\anaconda3\envs\Gurobi\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\karimia\AppData\Local\anaconda3\envs\Gurobi\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\karimia\AppData\Local\anaconda3\envs\Gurobi\Lib\site-pac

In [18]:
rf_random_search.cv_results_
tuning_result_rf_rs = pd.DataFrame(rf_random_search.cv_results_)
tuning_result_rf_rs
rf_random_search.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 30,
 'criterion': 'entropy'}

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 300,min_samples_split= 5,min_samples_leaf= 2,max_features= 'log2',max_depth= 30,
                            criterion= 'entropy')
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
RF_predictions_sub = rf.predict(df_test)
print('Accuracy Score', ' : ', accuracy_score(y_val,y_pred))
print('precision_score', ' : ', precision_score(y_val,y_pred))
print('F1', ' : ', f1_score(y_val,y_pred))

Accuracy Score  :  0.8156424581005587
precision_score  :  0.8059701492537313
F1  :  0.7659574468085106


In [20]:
predictions_sub = rf.predict(df_test)
df = pd.DataFrame({"PassengerId":psgId.values,
                  "Survived": RF_predictions_sub })
df.to_csv("Submission.csv", index = False)

# Random Foerest with hyperparameter tuning with beysian optimiser

In [24]:
import skopt
from skopt import BayesSearchCV

#Define hyperparameters for Random Forest
rf_params = {
    'n_estimators': [100, 200, 300], 
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}

In [25]:
optimizer = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=rf_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_iter=100,
    return_train_score=False,
    n_jobs=-1
)

In [26]:
optimizer.fit(x_train, y_train)
best_hyperparameters = optimizer.best_params_
best_hyperparameters

OrderedDict([('criterion', 'entropy'),
             ('max_depth', 10),
             ('min_samples_leaf', 4),
             ('min_samples_split', 5),
             ('n_estimators', 100)])

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 100,min_samples_split= 5,min_samples_leaf= 4,max_depth= 10,
                            criterion= 'entropy')
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
RF_predictions_sub = rf.predict(df_test)
print('Accuracy Score', ' : ', accuracy_score(y_val,y_pred))
print('precision_score', ' : ', precision_score(y_val,y_pred))
print('F1', ' : ', f1_score(y_val,y_pred))

Accuracy Score  :  0.8324022346368715
precision_score  :  0.84375
F1  :  0.782608695652174


## Logistic Regression

In [28]:
clf = LogisticRegression( random_state = 2, max_iter = 1000).fit(x_train,y_train)
predictions = clf.predict(x_val)

In [29]:
print('Accuracy Score', ' : ', accuracy_score(y_val,predictions))
print('precision_score', ' : ', precision_score(y_val,predictions))
print('F1', ' : ', f1_score(y_val,predictions))

Accuracy Score  :  0.8212290502793296
precision_score  :  0.8
F1  :  0.7777777777777778


## Results

In [31]:
# For exporting results based on csv format

In [14]:
# predictions_sub = clf.predict(df_test)
# df = pd.DataFrame({"PassengerId":psgId.values,
#                   "Survived": RF_predictions_sub })
# df.to_csv("Submission.csv", index = False)