# Start

In [1]:
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns

#Pre-processing
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold

# Machine learning 
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier,Perceptron

from sklearn.tree import DecisionTreeClassifier

# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc,accuracy_score,roc_auc_score
# Managing Warnings 
import warnings
warnings.filterwarnings('ignore')

# Plot the Figures Inline
%matplotlib inline

# Functions

In [2]:
def Train_ByValidation(X_train,y_train,X_val,y_val,clf):     
    clf=clf.fit(X_train,y_train)
    Score_train = round(clf.score(X_train, y_train) * 100, 2)
    Score_val = round(clf.score(X_val, y_val) * 100, 2)    
    return Score_train,Score_val

def MakeDF(data_array,old_df,cols):
    new_df = pd.DataFrame(data_array, index=old_df.index, columns=cols)
    return new_df

def Scaling(X_train,X_val):
    ''' Scale features to the range of [0,1]'''
    scaler = MinMaxScaler().fit(X_train)
    X_train_new=scaler.transform(X_train)
    X_val_new=scaler.transform(X_val)
    selected_cols=X_train.columns
    return MakeDF(X_train_new,X_train,selected_cols),MakeDF(X_val_new,X_val,selected_cols) 

# Read Data

In [3]:
train_df=pd.read_csv("./output/train_modified.csv",index_col=0)
test_df=pd.read_csv("./output/test_modified.csv",index_col=0)
X_test=test_df

In [4]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,0,3,0,22.0,7.25,0,1,2,0
1,1,1,1,38.0,71.2833,1,3,2,0
2,1,3,1,26.0,7.925,0,2,1,1
3,1,1,1,35.0,53.1,0,3,2,0
4,0,3,0,35.0,8.05,0,1,1,1


In [5]:
test_df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone
0,3,0,34.5,7.8292,2,1,1,1
1,3,1,47.0,7.0,0,3,2,0
2,2,0,62.0,9.6875,2,1,1,1
3,3,0,27.0,8.6625,0,1,1,1
4,3,1,22.0,12.2875,0,3,3,0


# Train Model

In [15]:
results=[]

X_train=train_df.drop(['Survived'], axis=1, errors='ignore')
y_train=train_df['Survived']

# Creating Train and validation sets
X,y=X_train,y_train
cv = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
for train_index, val_index in cv.split(X,y):
    X_train,X_val,y_train,y_val=X.iloc[train_index,:],X.iloc[val_index,:],y.iloc[train_index],y.iloc[val_index]

# data can be scaled to [0,1] in case LogisticRegression classifier    
# X_train,X_val=Scaling(X_train,X_val)

# Logistic Regression Classifier
clf = LogisticRegression( solver='sag', multi_class='multinomial',random_state=42)  
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['LogisticRegression',Score_train,Score_val])

# Random Forest Classifier
clf =RandomForestClassifier(n_estimators=100)
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['RandomForestClassifier',Score_train,Score_val])

# Decision Tree Classifier
clf = DecisionTreeClassifier()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['DecisionTreeClassifier',Score_train,Score_val])

# SGD Classifier
clf = SGDClassifier()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['SGDClassifier',Score_train,Score_val])

# Linear SVC
clf = LinearSVC()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['LinearSVC',Score_train,Score_val])


# Perceptron
clf =  Perceptron()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['Perceptron',Score_train,Score_val])

# K Neighbors Classifier
clf = KNeighborsClassifier(n_neighbors = 3)
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['KNeighborsClassifier',Score_train,Score_val])

# Support Vector Machines
clf = SVC()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['SVC',Score_train,Score_val])

# Gaussian Naive Bayes
clf = GaussianNB()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['GaussianNB',Score_train,Score_val])

# Ada Boost Classifier
clf=AdaBoostClassifier(n_estimators=100)
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['AdaBoostClassifier',Score_train,Score_val])

# Extra Trees Classifier
clf=ExtraTreesClassifier()
Score_train,Score_val=Train_ByValidation(X_train,y_train,X_val,y_val,clf)
results.append(['ExtraTreesClassifier',Score_train,Score_val])


Results_df = pd.DataFrame(results,columns=['Classifier', 'Train Acc','Val Acc']).sort_values(by=['Val Acc'],ascending=False)
Results_df.head(15)


Unnamed: 0,Classifier,Train Acc,Val Acc
9,AdaBoostClassifier,87.44,83.82
8,GaussianNB,78.92,82.92
1,RandomForestClassifier,98.43,80.9
10,ExtraTreesClassifier,98.43,78.65
2,DecisionTreeClassifier,98.43,76.63
0,LogisticRegression,71.52,72.13
5,Perceptron,71.52,70.34
6,KNeighborsClassifier,82.06,70.34
3,SGDClassifier,70.63,68.09
7,SVC,93.72,67.64


# Submission to find the Test Score

In [30]:
test_df=pd.read_csv('./input/test.csv')

clf=AdaBoostClassifier(n_estimators=100)

clf=clf.fit(X_train,y_train)
Y_Pred_test = clf.predict(X_test)

submission = pd.DataFrame({"PassengerId": test_df["PassengerId"],"Survived": Y_Pred_test})
submission.to_csv('./output/submission.csv', index=False)

print("Mission was done!!!")

Mission was done!!!


In [31]:
clf

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

This classifire can be tuned (Hyper-parameters tuning) to acheive better results.