## Data Importation

In [110]:
import pandas as pd
import numpy as np

train = pd.read_csv('./Dataset/train.csv')
test = pd.read_csv('./Dataset/test.csv')

In [111]:
train.shape

(891, 12)

In [112]:
test.shape
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [113]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [114]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


let's see infos 

In [115]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We may want to know how many people survived in each class. We can use the groupby method to quickly do this:

In [116]:
pclass = train.groupby('Pclass').Survived.value_counts() 
pclass

Pclass  Survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: count, dtype: int64

as we can see , the statistics show that the survival rate of the passengers in the first class is higher than the second and third class. we can exploit this info 

## Feature Engineering

Let's create some additional features from the existing ones ( feature engineering ) . first let's add , family size

In [141]:
def feature_engineering (train, test):
    train['Family_Size']=train['SibSp']+train['Parch']
    test['Family_Size']=test['SibSp']+test['Parch']
    train["Level"]=(1/train["Pclass"])*train["Fare"]
    test["Level"]=(1/test["Pclass"])*test["Fare"]
    train['Deck']=train['Cabin'].str[0]
    test['Deck']=test['Cabin'].str[0]

In [142]:
train_df = train.copy()
test_df = test.copy()
feature_engineering(train_df, test_df)

In [143]:
corr_matrix = train_df.corr(numeric_only=True)

In [144]:
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Level          0.267627
Fare           0.257307
Parch          0.081629
Family_Size    0.016639
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

As we can see , it seems that the Fare and Pclass are the most correlated attributes with the Survived attribute.

let's prepare our train set now

## Stratified Sampling



Now let's use Startified sampling from scikit learn , basically strata means spliting your dataset into many 

In [156]:
from sklearn.model_selection import train_test_split
X= train_df.drop(['Survived'], axis=1)
y= train_df['Survived']
train_X, val_X ,train_y, val_y= train_test_split(X,y, test_size=0.2, stratify=train_df["Survived"], random_state=42)

## Pipeline Creation

let's create a pipline that prepares the data for training. but before let devise our dataset into numerical and categorical sub datasets

In [146]:
num_features=['Age','SibSp','Parch','Fare',"Pclass","Level","Family_Size"]
cat_features=['Sex',"Deck"]

In [147]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('std_scaler', StandardScaler()),
 ])

In [148]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([("num_pipeline", num_pipeline, num_features),("cat", OneHotEncoder(), cat_features)])
train_prepared = full_pipeline.fit_transform(train_X)
val_prepared = full_pipeline.transform(val_X)
test_prepared = full_pipeline.transform(test_df) 

## Selecting Best Model

there are many binary classification estimators , we want to choose the best one 

In [149]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
classifiers = [LogisticRegression(),RandomForestClassifier(),KNeighborsClassifier()]

let's evaluate the models without performing any hyperparameter tuning

In [150]:
from numpy import mean
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score,precision_score
for classifier in classifiers:
    classifier.fit(train_prepared, train_y)
    pred= classifier.predict(val_prepared)
    print(classifier,"roc_auc",roc_auc_score(val_y, pred, average='weighted'))
    print(classifier,"accuracy",accuracy_score(val_y, pred))
    print(classifier,"precision",precision_score(val_y, pred))
    print(classifier,"recall",recall_score(val_y, pred))
    
    


LogisticRegression() roc_auc 0.7860342555994729
LogisticRegression() accuracy 0.8100558659217877
LogisticRegression() precision 0.7966101694915254
LogisticRegression() recall 0.6811594202898551
RandomForestClassifier() roc_auc 0.7887351778656126
RandomForestClassifier() accuracy 0.8100558659217877
RandomForestClassifier() precision 0.7868852459016393
RandomForestClassifier() recall 0.6956521739130435
KNeighborsClassifier() roc_auc 0.7850461133069829
KNeighborsClassifier() accuracy 0.7988826815642458
KNeighborsClassifier() precision 0.746268656716418
KNeighborsClassifier() recall 0.7246376811594203


From the previous results , we can say that the logistic regression is the best model to use 

## Hyperparameters Fine Tuning

In [157]:
from sklearn.model_selection import RandomizedSearchCV
logistic = LogisticRegression()
final_train_X = full_pipeline.fit_transform(X)
final_train_y = y
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-4, 4, 50),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': list(range(100, 5000, 100)),
    'fit_intercept': [True, False],
    'intercept_scaling': np.linspace(1, 10, 50),
    'class_weight': [None, 'balanced'],
    'warm_start': [True, False],
    'l1_ratio': np.linspace(0, 1, 20)
}
rand_search = RandomizedSearchCV(estimator=logistic, param_distributions=param_dist,n_iter=200, cv= 10 ,scoring="accuracy", verbose=3, n_jobs=-1)
rand_search.fit(final_train_X, final_train_y)


Fitting 10 folds for each of 200 candidates, totalling 2000 fits


690 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(se

In [158]:
rand_search.best_params_

{'warm_start': True,
 'solver': 'saga',
 'penalty': 'elasticnet',
 'max_iter': 4600,
 'l1_ratio': 0.0,
 'intercept_scaling': 5.040816326530613,
 'fit_intercept': False,
 'class_weight': None,
 'C': 7.9060432109076855}

In [159]:
rand_search.best_score_

0.8024719101123596

In [160]:
model = rand_search.best_estimator_

Now we will use the stratified kfold cross validation method to evaluate our model

In [161]:
from sklearn.model_selection import StratifiedKFold
def stratified_cross_val(model, X, y, n_splits=20):
    accuracy=[]
    skf= StratifiedKFold(n_splits=n_splits)
    final_train_X = pd.DataFrame(X)
    skf.get_n_splits(final_train_X, y)
    for train_index, test_index in skf.split(X, y):
        train_X, val_X = X[train_index], X[test_index]
        train_y, val_y = y[train_index], y[test_index]
        model.fit(train_X, train_y)
        pred= model.predict(val_X)
        score = accuracy_score(val_y, pred)
        accuracy.append(score)
    return mean(accuracy)
        

from what we can see , the logistic regression classifier is the best performs better than the other models

In [162]:
print(stratified_cross_val(model, final_train_X, final_train_y))

0.7991161616161617


model.fit(train)

In [163]:
model.fit(final_train_X, final_train_y)
predictions = model.predict(test_prepared)
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

let's now save the submission 

In [164]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [165]:
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
