## Data Importation

In [93]:
import pandas as pd
import numpy as np

train = pd.read_csv('./Dataset/train.csv')
test = pd.read_csv('./Dataset/test.csv')

In [94]:
train.shape

(891, 12)

In [95]:
test.shape
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [96]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [97]:
train["Name"]

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [98]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


let's see infos 

In [99]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We may want to know how many people survived in each class. We can use the groupby method to quickly do this:

In [100]:
pclass = train.groupby('Pclass').Survived.value_counts() 
pclass

Pclass  Survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: count, dtype: int64

as we can see , the statistics show that the survival rate of the passengers in the first class is higher than the second and third class. we can exploit this info 

## Feature Engineering

Let's create some additional features from the existing ones ( feature engineering ) . first let's add , family size

In [101]:
def feature_engineering (train, test):
    title_pattern = r',\s(.*?\.)\s'

    # Extract the title using str.extract
    train['Title'] = train['Name'].str.extract(title_pattern)
    test['Title'] = test['Name'].str.extract(title_pattern)
    train['Family_Size']=train['SibSp']+train['Parch']
    test['Family_Size']=test['SibSp']+test['Parch']
    train["Level"]=(train["Pclass"])*train["Fare"]*train["Age"]
    test["Level"]=(test["Pclass"])*test["Fare"]*test["Age"]

In [102]:
train_df = train.copy()
test_df = test.copy()
feature_engineering(train_df, test_df)

In [103]:
train_df["Title"].value_counts()

Title
Mr.              517
Miss.            182
Mrs.             125
Master.           40
Dr.                7
Rev.               6
Mlle.              2
Major.             2
Col.               2
the Countess.      1
Capt.              1
Ms.                1
Sir.               1
Lady.              1
Mme.               1
Don.               1
Jonkheer.          1
Name: count, dtype: int64

In [104]:
corr_matrix = train_df.corr(numeric_only=True)

In [105]:
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Level          0.173592
Parch          0.081629
Family_Size    0.016639
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

As we can see , it seems that the Fare and Pclass are the most correlated attributes with the Survived attribute.

let's prepare our train set now

## Stratified Sampling



Now let's use Startified sampling from scikit learn , basically strata means spliting your dataset into many 

In [106]:
from sklearn.model_selection import train_test_split
X= train_df.drop(['Survived'], axis=1)
y= train_df['Survived']
train_X, val_X ,train_y, val_y= train_test_split(X,y, test_size=0.2, stratify=train_df["Survived"], random_state=42)

## Pipeline Creation

let's create a pipline that prepares the data for training. but before let devise our dataset into numerical and categorical sub datasets

In [107]:
num_features=["Age",'Fare',"SibSp","Parch","Level"]
cat_features=['Sex',"Embarked"]
ordinal_features=["Pclass"]

then to fix the problem of having some categories in the training set and others in the test set , we'll concatenate all of our data , and fit it 

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler ,OneHotEncoder
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="most_frequent")),
 ('std_scaler', StandardScaler()),
 ])
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(sparse_output=False)),
    ])
ordinal_pipeline = Pipeline([("ordinal_encoder", OrdinalEncoder()),("imputer", SimpleImputer(strategy="most_frequent"))    
])

In [109]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([("num_pipeline", num_pipeline, num_features),("cat_pipeline", cat_pipeline, cat_features),("ordinal_pipeline", ordinal_pipeline, ordinal_features)])
train_prepared = full_pipeline.fit_transform(train_X)
test_prepared = full_pipeline.transform(test_df)
val_prepared = full_pipeline.transform(val_X)


## Selecting Best Model

there are many binary classification estimators , we want to choose the best one 

In [110]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
classifiers = [LogisticRegression(),RandomForestClassifier(),KNeighborsClassifier(),SVC()]

let's evaluate the models without performing any hyperparameter tuning

In [111]:
from numpy import mean
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score,precision_score
for classifier in classifiers:
    classifier.fit(train_prepared, train_y)
    pred= classifier.predict(val_prepared)
    print(classifier,"roc_auc",roc_auc_score(val_y, pred, average='weighted'))
    print(classifier,"accuracy",accuracy_score(val_y, pred))
    print(classifier,"precision",precision_score(val_y, pred))
    print(classifier,"recall",recall_score(val_y, pred))
    
    


LogisticRegression() roc_auc 0.76699604743083
LogisticRegression() accuracy 0.7932960893854749
LogisticRegression() precision 0.7758620689655172
LogisticRegression() recall 0.6521739130434783
RandomForestClassifier() roc_auc 0.7841897233201581
RandomForestClassifier() accuracy 0.8044692737430168
RandomForestClassifier() precision 0.7741935483870968
RandomForestClassifier() recall 0.6956521739130435
KNeighborsClassifier() roc_auc 0.7914361001317525
KNeighborsClassifier() accuracy 0.8100558659217877
KNeighborsClassifier() precision 0.7777777777777778
KNeighborsClassifier() recall 0.7101449275362319
SVC() roc_auc 0.8096179183135704
SVC() accuracy 0.8324022346368715
SVC() precision 0.8305084745762712
SVC() recall 0.7101449275362319


From the previous results , we can say that the logistic regression is the best model to use 

## Hyperparameters Fine Tuning

first let's prepare the dataset for this 

In [112]:
final_train_X = full_pipeline.fit_transform(X)
final_train_y = y

In [113]:
from sklearn.model_selection import RandomizedSearchCV

# Assuming final_train_X and final_train_y are your training data
rf_classifier = RandomForestClassifier()

param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    'random_state': [42],  # Set a random state for reproducibility
}

rand_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=100, cv=5, scoring="accuracy", verbose=3, n_jobs=-1)
rand_search.fit(final_train_X, final_train_y)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


180 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
74 fits failed with the following error:
Traceback (most recent call last):
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\programming\AI\Titanic_Disaster_Detection\Project1\Lib\si

In [114]:
rand_search.best_params_

{'random_state': 42,
 'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 10,
 'criterion': 'gini',
 'class_weight': None,
 'bootstrap': True}

In [115]:
rand_search.best_score_

0.8316678174628084

In [116]:
model = rand_search.best_estimator_

Now we will use the stratified kfold cross validation method to evaluate our model

In [117]:
from sklearn.model_selection import StratifiedKFold
def stratified_cross_val(model, X, y, n_splits=20):
    accuracy=[]
    skf= StratifiedKFold(n_splits=n_splits)
    final_train_X = pd.DataFrame(X)
    skf.get_n_splits(final_train_X, y)
    for train_index, test_index in skf.split(X, y):
        train_X, val_X = X[train_index], X[test_index]
        train_y, val_y = y[train_index], y[test_index]
        model.fit(train_X, train_y)
        pred= model.predict(val_X)
        score = accuracy_score(val_y, pred)
        accuracy.append(score)
    return mean(accuracy)
        

from what we can see , the classifier performs better than the other models

In [118]:
print(stratified_cross_val(model, final_train_X, final_train_y))

0.8282575757575759


model.fit(train)

In [119]:
model.fit(final_train_X, final_train_y)
predictions = model.predict(test_prepared)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

let's now save the submission 

In [120]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [121]:
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [122]:
output.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0
