<center><h1 style="color:Red;"><u><b>Titanic : Machine Learning from Disaster</b></u></h1></center>

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 

<b>Loading Data</b>

In [2]:
# Read the train and test CSV files from the local dataset folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Kaggles\Titanic\Dataset\train.csv',
        r'C:\Kaggles\Titanic\Dataset\test.csv',
    )
)
# Keep the test passenger ids aside: the cleaning step drops "PassengerId",
# but the submission table built at the end needs it.
test_ids = df_test["PassengerId"]

print(df_train.shape)
df_train.head(3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


<b>Data Cleaning</b>

In [3]:
def clean(data):
    """Return a cleaned copy of a raw Titanic passenger frame.

    Drops the identifier / free-text columns that are not used as features
    and fills the missing values that remain.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``Ticket``, ``PassengerId``,
        ``Name``, ``Cabin``, ``SibSp``, ``Parch``, ``Fare``, ``Age`` and
        ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four dropped columns, in which the numeric
        columns have their gaps filled with the column median and missing
        ``Embarked`` values are replaced by ``"U"``. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # ``drop`` returns a new frame, so the caller's frame is left untouched.
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)

    # Assign the filled column back instead of calling
    # ``data[col].fillna(..., inplace=True)``: the in-place form acts on a
    # temporary Series, which triggers a FutureWarning in pandas 2.x and
    # leaves ``data`` unchanged once copy-on-write is enabled.
    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        data[col] = data[col].fillna(data[col].median())

    # "U" is the placeholder used for a missing port of embarkation.
    data["Embarked"] = data["Embarked"].fillna("U")
    return data

# Run both splits through the same cleaning routine.
df_train, df_test = clean(df_train), clean(df_test)

<b> Data cleaned up. </b>

In [4]:
from sklearn import preprocessing

# Integer-encode the categorical feature columns of both splits.
categorical_columns = ["Sex", "Embarked"]

for column in categorical_columns:
    # Fit ONE encoder per column on the labels of both splits and reuse it
    # for each of them. Calling fit_transform separately on train and test
    # gives every split its own label -> integer mapping, and the two
    # mappings disagree as soon as the splits do not contain exactly the
    # same set of labels (e.g. the "U" placeholder present in only one).
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([df_train[column], df_test[column]], ignore_index=True))

    # Plain column assignment replaces the column outright rather than
    # writing the codes into the existing string column through ``.loc``.
    df_train[column] = le.transform(df_train[column])
    df_test[column] = le.transform(df_test[column])

df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Separate the target column from the features.
Y = df_train["Survived"]
X = df_train.drop(columns="Survived")

# Hold out 12% of the rows for a quick evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.12,
    random_state=42,
)

In [6]:
from sklearn.metrics import accuracy_score

# Fit the logistic-regression model on the training part of the split,
# then report its accuracy on the held-out part.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
accuracy_score(y_test, predictions)

0.8411214953271028

<b>Cross Validation (ShuffleSplit)</b>

In [7]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Score logistic regression on 10 independent random splits, each holding
# out 12% of the rows.
# The seed is written as the integer 0. The earlier ``random_state=False``
# reads like "no seeding", but a bool is an integer in Python, so it was
# used as seed 0 as well -- the splits are unchanged.
cv_split = ShuffleSplit(n_splits=10, test_size=0.12, random_state=0)
KFold_score_LR = cross_val_score(LogisticRegression(max_iter=1000), X, Y, cv=cv_split)
print(KFold_score_LR)
print("\nAverage Score : ", np.mean(KFold_score_LR))

[0.80373832 0.77570093 0.79439252 0.76635514 0.81308411 0.75700935
 0.79439252 0.82242991 0.86915888 0.85046729]

Average Score :  0.8046728971962616


<b>Grid Search CV </b>

Note: This GridSearchCV cell compares several different models. Because SVC is searched over many parameter combinations, it takes a very long time to produce results. An earlier version needed 15+ hours to finish; I have since made some modifications to the grid, but it is still slow, so don't run this cell ;)   XD !!


In [8]:
from sklearn.model_selection import GridSearchCV
# The three regressor imports below are no longer used by this cell; they are
# kept only so that any other cell relying on these names keeps working.
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
# Classifier counterparts used by the model comparison.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def find_best_model_using_gridsearchcv(X, y, n_splits=100, test_size=0.2,
                                       random_state=42, n_jobs=None):
    """Grid-search several classifiers and report the best setting of each.

    Every candidate is a classifier and every search is scored with
    accuracy, so the ``best_score`` values of the rows can be compared
    directly. (The earlier version mixed in Lasso, DecisionTreeRegressor and
    RandomForestRegressor, whose default score is R^2 rather than accuracy,
    which made the comparison against the classifiers meaningless.)

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Class labels.
    n_splits : int, default=100
        Number of random train/validation splits per parameter combination.
        Lower it for a much faster, noisier search.
    test_size : float, default=0.2
        Fraction of the rows held out in each split.
    random_state : int, default=42
        Seed for the splitter and for the randomised estimators.
    n_jobs : int or None, default=None
        Forwarded to ``GridSearchCV``; ``-1`` uses all available cores.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``best_score`` (mean
        validation accuracy) and ``best_params``.
    """
    algos = {
        'logistic_regression' : {
            'model': LogisticRegression(max_iter=1000),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        # Classification counterpart of Lasso: L1-penalised logistic
        # regression. C is the inverse regularisation strength, so
        # alpha = [1, 2] roughly corresponds to C = [1.0, 0.5].
        # liblinear is used because it supports the L1 penalty.
        'logistic_regression_l1': {
            'model': LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000),
            'params': {
                'C': [1.0, 0.5]
            }
        },
        'decision_tree': {
            'model': DecisionTreeClassifier(random_state=random_state),
            'params': {
                'criterion' : ['gini', 'entropy'],
                'splitter': ['best','random']
            }
        },
        'SVC':{
            'model' : SVC(),
            'params' : {
                'C': [1, 10, 100],
                'gamma': [0.1, 0.01, 0.001],
                'kernel': ['rbf', 'poly', 'sigmoid'],
            }
        },
        'random_forest' : {
            'model' : RandomForestClassifier(random_state=random_state),
            'params' : {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 5, 10, 20],
                'min_samples_split': [2, 5, 10],
                'max_features': ['sqrt', 'log2']
            }
        }
    }
    scores = []
    # The same splitter is shared by every search, so all models are
    # evaluated on identical train/validation partitions.
    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    for algo_name, config in algos.items():
        gs = GridSearchCV(
            config['model'],
            config['params'],
            cv=cv,
            scoring='accuracy',
            n_jobs=n_jobs,
            return_train_score=False,
        )
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X, Y)

<b>It's clearly evident that using Logistic Regression is a better way to analyse this problem. </b>

In [None]:
# Predict on the cleaned, label-encoded test frame with `clf`, the
# logistic-regression model fitted on x_train earlier in the notebook.
submission_preds = clf.predict(df_test)

In [None]:
# Pair each test passenger id (saved before cleaning dropped the column)
# with its predicted label, then preview the first rows.
df = pd.DataFrame(
    dict(
        PassengerId=test_ids.values,
        Survived=submission_preds,
    )
)
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [None]:
# Write the submission table to disk; index=False keeps the row index out of
# the file so it contains only the PassengerId and Survived columns.
df.to_csv(r"C:\Kaggles\Titanic\Submissions\Submission_1.csv", index=False)