# #TITANIC SURVIVAL PREDICTION

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Load Dataset

In [2]:
import pandas as pd
df = pd.read_csv("Titanic-Dataset.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Perform basic data quality checks with Data Cleaning and processing

In [4]:
df.shape

(891, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
df.duplicated().sum()

0

In [8]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
cat = list(df.columns[df.dtypes=='object'])

In [10]:
for i in cat:
    m = df[i].mode()[0]
    df[i].fillna(m,inplace=True)

In [11]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [12]:
mn = df['Age'].mean()
df['Age'].fillna(mn,inplace=True)

In [13]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [14]:
df.drop(columns=['Name','Ticket','Cabin'],axis=1,inplace=True)

In [15]:
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Sex              2
Age             89
SibSp            7
Parch            7
Fare           248
Embarked         3
dtype: int64

In [16]:
print("After perform the Data Cleaning Process the size of data :", df.shape)

After perform the Data Cleaning Process the size of data : (891, 9)


In [17]:
## performing mean encoding
df['PassengerId encoded'] = round(df.groupby('PassengerId')['Survived'].transform('mean'),1)
df['Pclass encoded'] = round(df.groupby('Pclass')['Survived'].transform('mean'),1)
df['Sex encoded'] = round(df.groupby('Sex')['Survived'].transform('mean'),1)
df['Age encoded'] = round(df.groupby('Age')['Survived'].transform('mean'),1)
df['SibSp encoded'] = round(df.groupby('SibSp')['Survived'].transform('mean'),1)
df['Parch encoded'] = round(df.groupby('Parch')['Survived'].transform('mean'),1)
df['Fare encoded'] = round(df.groupby('Fare')['Survived'].transform('mean'),1)
df['Embarked encoded'] = round(df.groupby('Embarked')['Survived'].transform('mean'),1)

df.drop(columns=['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'],inplace=True)
df['Survived'] = round(df['Survived'],1)
df.head()

Unnamed: 0,Survived,PassengerId encoded,Pclass encoded,Sex encoded,Age encoded,SibSp encoded,Parch encoded,Fare encoded,Embarked encoded
0,0,0.0,0.2,0.2,0.4,0.5,0.3,0.1,0.3
1,1,1.0,0.6,0.7,0.5,0.5,0.3,1.0,0.6
2,1,1.0,0.2,0.7,0.3,0.3,0.3,0.4,0.3
3,1,1.0,0.6,0.7,0.6,0.5,0.3,0.6,0.3
4,0,0.0,0.2,0.2,0.6,0.3,0.3,0.1,0.3


## Separate X(inputs) and Y(targets) features

In [18]:
X = df.drop(columns='Survived')
Y =df[['Survived']]

In [19]:
X.head()

Unnamed: 0,PassengerId encoded,Pclass encoded,Sex encoded,Age encoded,SibSp encoded,Parch encoded,Fare encoded,Embarked encoded
0,0.0,0.2,0.2,0.4,0.5,0.3,0.1,0.3
1,1.0,0.6,0.7,0.5,0.5,0.3,1.0,0.6
2,1.0,0.2,0.7,0.3,0.3,0.3,0.4,0.3
3,1.0,0.6,0.7,0.6,0.5,0.3,0.6,0.3
4,0.0,0.2,0.2,0.6,0.3,0.3,0.1,0.3


In [20]:
Y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [21]:
X.shape

(891, 8)

In [22]:
Y.shape

(891, 1)

## Split the dataset into training data and testing data

In [23]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.25,random_state=21)

In [24]:
print(xtrain.shape)
print(ytrain.shape)

(668, 8)
(668, 1)


In [25]:
print(xtest.shape)
print(ytest.shape)

(223, 8)
(223, 1)


# Build the model

## Import necessary libraries

In [26]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

In [27]:
# Apply SMOTE to the training data
### SMOTE (Synthetic Minority Over-sampling Technique): Creates synthetic samples for the minority class.
smote = SMOTE(random_state=24)
X_train_smote, y_train_smote = smote.fit_resample(xtrain, ytrain)

In [28]:
dct = {
    'Linear':LinearRegression(),
    'DecisionTree':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor(),
    'KNN':KNeighborsRegressor(),
}

In [29]:
dct.items()

dict_items([('Linear', LinearRegression()), ('DecisionTree', DecisionTreeRegressor()), ('RandomForest', RandomForestRegressor()), ('KNN', KNeighborsRegressor())])

In [30]:
train_mse = []
train_r2 = []
test_mse = []
test_r2 = []
train_cv = []

for name,model in dct.items():
    # fit the model
    m = model.fit(X_train_smote,y_train_smote)
    ypred_train = m.predict(X_train_smote)
    ypred_test = m.predict(xtest)
    # calculate MSE
    mse_train = mean_squared_error(y_train_smote,ypred_train)
    mse_test = mean_squared_error(ytest,ypred_test)
    # caluclate R2
    r2_train = (r2_score(y_train_smote,ypred_train))*100
    r2_test = (r2_score(ytest,ypred_test))*100
    # calculate cross validated scores
    cv = cross_val_score(m,X_train_smote,y_train_smote,cv=5,scoring='r2')
    scores = (cv.mean())*100

    # add these values to the respective list to compare the output
    train_mse.append(mse_train)
    train_r2.append(r2_train)
    test_mse.append(mse_test)
    test_r2.append(r2_test)
    train_cv.append(scores)

    # print the results
    print(f'Scores for {name}')
    print("Training Scores")
    print(f'MSE:{train_mse}')
    print(f'R2:{train_r2}')
    print("Testing Scores")
    print(f'MSE:{test_mse}')
    print(f'R2:{test_r2}')
    print(f'CV:{train_cv}')
    print("===============================")

Scores for Linear
Training Scores
MSE:[2.7041164915368336e-30]
R2:[100.0]
Testing Scores
MSE:[2.7286349543940903e-30]
R2:[100.0]
CV:[100.0]
Scores for DecisionTree
Training Scores
MSE:[2.7041164915368336e-30, 0.0]
R2:[100.0, 100.0]
Testing Scores
MSE:[2.7286349543940903e-30, 0.0]
R2:[100.0, 100.0]
CV:[100.0, 100.0]
Scores for RandomForest
Training Scores
MSE:[2.7041164915368336e-30, 0.0, 0.0]
R2:[100.0, 100.0, 100.0]
Testing Scores
MSE:[2.7286349543940903e-30, 0.0, 0.0]
R2:[100.0, 100.0, 100.0]
CV:[100.0, 100.0, 100.0]
Scores for KNN
Training Scores
MSE:[2.7041164915368336e-30, 0.0, 0.0, 0.0]
R2:[100.0, 100.0, 100.0, 100.0]
Testing Scores
MSE:[2.7286349543940903e-30, 0.0, 0.0, 0.0]
R2:[100.0, 100.0, 100.0, 100.0]
CV:[100.0, 100.0, 100.0, 100.0]


In [31]:
res = {'Name':list(dct.keys()),
       'MSE Training Scores':train_mse,
       'MSE Testing Scores':test_mse,
       'R2 Training Scores':train_r2,
       'R2 Testing Scores':test_r2,
       'CV Training Scores':train_cv}

In [32]:
df_res = pd.DataFrame(res)
df_res.sort_values('CV Training Scores',ascending=False)

Unnamed: 0,Name,MSE Training Scores,MSE Testing Scores,R2 Training Scores,R2 Testing Scores,CV Training Scores
0,Linear,2.7041159999999997e-30,2.7286349999999998e-30,100.0,100.0,100.0
1,DecisionTree,0.0,0.0,100.0,100.0,100.0
2,RandomForest,0.0,0.0,100.0,100.0,100.0
3,KNN,0.0,0.0,100.0,100.0,100.0


In [33]:
model = RandomForestRegressor()
model.fit(X_train_smote,y_train_smote)

In [34]:
model.score(X_train_smote,y_train_smote)

1.0

In [35]:
model.score(xtest,ytest)

1.0

In [36]:
model2 = DecisionTreeRegressor()
model2.fit(X_train_smote,y_train_smote)

In [37]:
model2.score(X_train_smote,y_train_smote)

1.0

In [38]:
model2.score(xtest,ytest)

1.0

## Evaluate the models : Random Forest and Decision Tree

In [39]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [40]:
def eval_model(model,xtrain,ytrain):
    # Fit the model
    model.fit(X_train_smote,y_train_smote)
    # Predict the scores
    ypred_test = model.predict(xtest)
    # Calculate MSE,RMSE,MAE,R2 scores
    mse = mean_squared_error(ytest,ypred_test)
    rmse = mse**(1/2)
    mae = mean_absolute_error(ytest,ypred_test)
    r2 = r2_score(ytest,ypred_test)
    return mse,rmse,mae,r2

## Random Forest Evaluation Metrics

In [41]:
(MSE,RMSE,MAE,r2) = eval_model(model,X_train_smote,y_train_smote)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.0
RMSE:0.0
MAE:0.0
R2:1.0


## Decision Tree Evaluation Metrics

In [42]:
(MSE,RMSE,MAE,r2) = eval_model(model2,X_train_smote,y_train_smote)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.0
RMSE:0.0
MAE:0.0
R2:1.0


## Model Prediction

In [43]:
ypred_test = model.predict(xtest)
ypred_test

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
       0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 1.])

In [44]:
df_final = xtest
df_final['survived_predict_score'] = ypred_test
df_final.tail()

Unnamed: 0,PassengerId encoded,Pclass encoded,Sex encoded,Age encoded,SibSp encoded,Parch encoded,Fare encoded,Embarked encoded,survived_predict_score
682,0.0,0.2,0.2,0.2,0.3,0.3,0.0,0.3,0.0
442,0.0,0.2,0.2,0.3,0.5,0.3,0.2,0.3,0.0
541,0.0,0.2,0.7,0.2,0.2,0.5,0.0,0.3,0.0
692,1.0,0.2,0.2,0.3,0.3,0.3,0.7,0.3,1.0
724,1.0,0.6,0.2,0.6,0.5,0.3,0.6,0.3,1.0


In [45]:
df_final.to_csv('survived_predict.csv',index=False)