**1.UNDERSTANDING DATA**

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Titanic training split into `ds` and the test split into `dt`.
ds=pd.read_csv('../input/titanic/train.csv')
dt=pd.read_csv('../input/titanic/test.csv')

*FEATURES:*

*PassengerId : Passenger's id*

*Survived : Survival (0 = No, 1 = Yes)*

*Pclass : Ticket class (1 = 1st(upper), 2 = 2nd(middle), 3 = 3rd(lower))*

*Name : Name*

*Sex : Sex (Male, Female)*

*Age : Age in years*

*SibSp : # of siblings / spouses aboard the Titanic*

*Parch : # of parents / children aboard the Titanic*

*Ticket : Ticket number*

*Fare : Passenger fare*

*Cabin : Cabin number*

*Embarked : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)*

In [None]:
# Preview the first rows of the training frame.
ds.head()

In [None]:
# Preview the first rows of the test frame.
dt.head()

In [None]:
# (rows, columns) of the training frame.
ds.shape

In [None]:
# (rows, columns) of the test frame.
dt.shape

In [None]:
# Confirm the Python type of the object returned by read_csv for the training data.
type(ds)

In [None]:
# Confirm the Python type of the object returned by read_csv for the test data.
type(dt)

In [None]:
# Total number of cells (rows x columns) in the training frame.
ds.size

In [None]:
# Total number of cells (rows x columns) in the test frame.
dt.size

In [None]:
# Descriptive statistics for the training frame.
ds.describe()

In [None]:
# Descriptive statistics for the test frame.
dt.describe()

In [None]:
# Column dtypes and non-null counts for the training frame.
ds.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
dt.info()

*There are some missing values in both train and test datasets*

In [None]:
# Number of missing values per column in the training frame.
ds.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
dt.isnull().sum()

**2. DATA VISUALIZATION**

In [None]:
def bar_chart(feature):
    """Draw a stacked bar chart of `feature` value counts, split by survival.

    Reads the notebook-level training frame `ds`. One bar is drawn for the
    rows with Survived == 1 and one for the rows with Survived == 0; each bar
    is stacked by the distinct values of `feature`.

    Parameters
    ----------
    feature : str
        Name of the column in `ds` whose values are counted.
    """
    outcome_rows = {'Survived': 1, 'Dead': 0}
    per_outcome = [
        ds.loc[ds['Survived'] == flag, feature].value_counts()
        for flag in outcome_rows.values()
    ]
    df = pd.DataFrame(per_outcome)
    df.index = list(outcome_rows)
    df.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
# Count of passengers per Survived value in the training frame.
sns.countplot(x='Survived',data = ds)
plt.show()

From the above visualization, the death rate is higher than the survival rate.

In [None]:
# Count of passengers per ticket class in the training frame.
sns.countplot(x='Pclass',data = ds)
plt.show()

In [None]:
# Survived vs. dead counts, stacked by ticket class.
bar_chart('Pclass')

From the above visualization, First Class's survival rate is higher than others and Third Class's death rate is higher than others.

In [None]:
# Count of passengers per sex in the training frame.
sns.countplot(x='Sex',data = ds)
plt.show()

In [None]:
# Survived vs. dead counts, stacked by sex.
bar_chart('Sex')

From the above visualization, the survival rate of females is higher than that of males.

In [None]:
# Count of passengers per port of embarkation in the training frame.
sns.countplot(x='Embarked',data = ds)
plt.show()

In [None]:
# Survived vs. dead counts, stacked by port of embarkation.
bar_chart('Embarked')

From the above visualization, a passenger who boarded at C was slightly more likely to survive, while passengers who boarded at Q or at S were more likely to die.

In [None]:
# Count of passengers per number of siblings/spouses aboard.
sns.countplot(x='SibSp',data = ds)
plt.show()

In [None]:
# Survived vs. dead counts, stacked by number of siblings/spouses aboard.
bar_chart('SibSp')

From the above visualization, a passenger who boarded with more than 2 siblings or spouses was more likely to survive.

From the above visualization, a passenger who boarded without siblings or a spouse was more likely to die.

In [None]:
# Count of passengers per number of parents/children aboard.
sns.countplot(x='Parch',data = ds)
plt.show()

In [None]:
# Survived vs. dead counts, stacked by number of parents/children aboard.
bar_chart('Parch')

From the above visualization, a passenger who boarded with more than 2 parents or children was more likely to survive, and a passenger who boarded alone was more likely to die.

In [None]:
# Distribution of passenger age (density histogram with a KDE overlay).
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# y axis on the same density scale distplot used.
sns.histplot(ds['Age'], kde=True, stat='density')
plt.show()

In [None]:
# Distribution of ticket fare (density histogram with a KDE overlay).
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# y axis on the same density scale distplot used.
sns.histplot(ds['Fare'], kde=True, stat='density')
plt.show()

The fare distribution is far from uniform: it is strongly right-skewed.

In [None]:
# Mean fare per ticket class, drawn as a bar chart.
fare = ds.pivot_table(values='Fare', index='Pclass')
fare_ax = fare.plot(kind='bar')
fare_ax.set_xlabel('Pclass')
fare_ax.set_ylabel('Average Fare')
fare_ax.set_title("Avg Fare")
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

The fare for first class ticket is so much higher than other classes

**3. FEATURE ENGINEERING**

*Feature engineering is the process of selecting, manipulating, and transforming raw data into features that can be used in supervised learning. Feature engineering, in simple terms, is the act of converting raw observations into desired features using statistical or machine learning approaches.*

In [None]:
# Pull the honorific out of each name: the run of letters that follows a space
# and is terminated by a period. The pattern is a raw string so that `\.` is
# passed to the regex engine instead of being an invalid string escape.
TITLE_PATTERN = r' ([A-Za-z]+)\.'
ds['TitlePrefix'] = ds['Name'].str.extract(TITLE_PATTERN, expand=False)
dt['TitlePrefix'] = dt['Name'].str.extract(TITLE_PATTERN, expand=False)

In [None]:
# Frequency of each extracted title in the training frame.
print("TRAINING SET")
print(ds['TitlePrefix'].value_counts())

In [None]:
# Frequency of each extracted title in the test frame.
print("TESTING SET")
print(dt['TitlePrefix'].value_counts())

In [None]:
# Collapse rare or equivalent titles into a handful of groups. The same
# replacements are applied, in the same order, to the training and test frames.
title_groups = (
    (['Capt', 'Dr', 'Major', 'Rev', 'Col'], 'Officials'),
    (['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'], 'High_Class'),
    (['Mlle', 'Ms'], 'Miss'),
    (['Mme'], 'Mrs'),
)
for frame in (ds, dt):
    for raw_titles, grouped_title in title_groups:
        frame['TitlePrefix'] = frame['TitlePrefix'].replace(raw_titles, grouped_title)

Title map

"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Officials": 4, "High_Class": 5

In [None]:
# Encode each grouped title as an integer code (Mr=0 ... High_Class=5).
tmapping = dict(zip(["Mr", "Miss", "Mrs", "Master", "Officials", "High_Class"], range(6)))
for frame in (ds, dt):
    frame['TitlePrefix'] = frame['TitlePrefix'].map(tmapping)

In [None]:
# The title has been extracted, so the raw Name column is removed from both frames.
for frame in (ds, dt):
    frame.drop(columns='Name', inplace=True)

**4. HANDLING MISSING VALUES AND MAPPING**

*MAPPING: It is the process of mapping categorical or string values to numerical values.*

**4.1 AGE**

In [None]:
# Fill missing ages with the mean age of passengers sharing the same title
# (rounded to one decimal). The result is assigned back to the column:
# calling fillna(inplace=True) on `frame["Age"]` mutates a column selection,
# which leaves the frame unchanged under pandas Copy-on-Write.
ds["Age"] = ds["Age"].fillna(ds.groupby("TitlePrefix")["Age"].transform("mean").round(1))
dt["Age"] = dt["Age"].fillna(dt.groupby("TitlePrefix")["Age"].transform("mean").round(1))

In [None]:
# Replace each age with an ordinal bucket, using right-closed intervals:
# (-inf, 16] -> 0, (16, 25] -> 1, (25, 35] -> 2, (35, 60] -> 3, (60, inf) -> 4.
# Missing ages stay missing. Applied identically to the train and test frames.
age_edges = [-np.inf, 16, 25, 35, 60, np.inf]
for frame in (ds, dt):
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(float)

CHILD 0

TEEN 1

ADULT 2

MIDDLE AGE 3

SENIOR 4

**4.2 EMBARKED**

In [None]:
# Embarkation-port counts for each ticket class, drawn as stacked bars.
P1, P2, P3 = (
    ds.loc[ds['Pclass'] == ticket_class, 'Embarked'].value_counts()
    for ticket_class in (1, 2, 3)
)
df = pd.DataFrame([P1, P2, P3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 5))

*Since "S" (which is label 0) accounts for more than 50% of passengers in each class, the null values are replaced with "S".*

In [None]:
# Missing embarkation ports are filled with 'S' in both the train and test frames.
for frame in (ds, dt):
    frame['Embarked'] = frame['Embarked'].fillna('S')

In [None]:
# Encode the port letters as integers: S -> 0, C -> 1, Q -> 2.
mapping = {port: code for code, port in enumerate(("S", "C", "Q"))}
for frame in (ds, dt):
    frame['Embarked'] = frame['Embarked'].map(mapping)

**4.3 CABIN**

More than half of the cabin values are null. So drop that column.

In [None]:
# Rebind both frames to copies without the Cabin column.
ds, dt = (frame.drop(columns=['Cabin']) for frame in (ds, dt))

**4.4 FARE**

In [None]:
# Fill missing fares with the mean fare of the passenger's ticket class.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `frame["Fare"]` mutates a column selection, which leaves the frame unchanged
# under pandas Copy-on-Write.
ds["Fare"] = ds["Fare"].fillna(ds.groupby("Pclass")["Fare"].transform("mean"))
dt["Fare"] = dt["Fare"].fillna(dt.groupby("Pclass")["Fare"].transform("mean"))

LOG TRANSFORMATION

In [None]:
# Visualise the log-transformed fare WITHOUT overwriting ds['Fare'].
# The binning cell that follows uses thresholds on the raw fare scale
# (15 / 30 / 90) and is applied to both `ds` and `dt`; storing log(Fare + 1)
# back into `ds` only would push every training fare into bin 0 while the
# test fares were still binned on raw values.
log_fare = np.log1p(ds['Fare'])
sns.histplot(log_fare, kde=True, stat='density')
plt.show()

In [None]:
# Replace each fare with an ordinal bucket, using right-closed intervals:
# (-inf, 15] -> 0, (15, 30] -> 1, (30, 90] -> 2, (90, inf) -> 3.
# Missing fares stay missing. Applied identically to the train and test frames.
fare_edges = [-np.inf, 15, 30, 90, np.inf]
for frame in (ds, dt):
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(float)

FARE=0 (<=15)

FARE=1 (between 15 and 30)

FARE=2 (between 30 and 90)

FARE=3 (>90)

**4.5 SEX**

In [None]:
# Encode sex as an integer: male -> 0, female -> 1.
mapping3 = dict(male=0, female=1)
for frame in (ds, dt):
    frame['Sex'] = frame['Sex'].map(mapping3)

*Thus, all the missing values are handled efficiently.*

In [None]:
# Re-check the per-column missing-value counts in the training frame after imputation.
ds.isnull().sum()

In [None]:
# Re-check the per-column missing-value counts in the test frame after imputation.
dt.isnull().sum()

In [None]:
# Remove the columns that are not used as model features from both frames.
for frame in (ds, dt):
    frame.drop(columns=['Ticket', 'PassengerId'], inplace=True)

In [None]:
# Separate the label: `pop` returns the Survived column and removes it from `ds` in place.
target = ds.pop('Survived')

**4.6 CORRELATION**

In [None]:
# Correlation heatmap over the raw training file.
# Only numeric/bool columns are passed to corr(): the raw file also contains
# string columns (e.g. Name, Sex, Ticket), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
data=pd.read_csv('../input/titanic/train.csv')
corr=data.select_dtypes(include=['number','bool']).corr()
plt.figure(figsize=(15,15))
sns.heatmap(corr,annot= True,cmap='coolwarm')
plt.show()

In [None]:
def correlation(data, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix is computed over the numeric and boolean
    columns of `data`; non-numeric columns are ignored (DataFrame.corr() raises
    on them in pandas >= 2.0). For every pair (i, j) with j < i in column
    order, column i is reported when the absolute correlation exceeds
    `threshold`, so for each correlated pair only the later column is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set of str
        Names of the flagged columns.
    """
    col_corr = set()  # names of the flagged columns
    corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()
    for i, colname in enumerate(corr_matrix.columns):
        # Only the lower triangle (j < i) is scanned, so each pair is visited once.
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # sign of the correlation is irrelevant
                col_corr.add(colname)
    return col_corr

In [None]:
# Columns of the raw training data whose absolute correlation with an earlier column exceeds 0.5.
corr_features = correlation(data, threshold=0.5)
n_corr_features = len(corr_features)
print("Length of corr_features : ", n_corr_features)
print("Corr_features : ", corr_features)

**5. MODELING**

In [None]:
# Hold out 25% of the training rows for evaluation, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(ds, target, test_size=0.25, stratify=target,random_state=0)

In [None]:
# Sanity-check that features and labels of the split have matching sizes.
print(len(X_test),len(y_test))
print(y_train.shape)
print(y_test.shape)

In [None]:
# Feature scaling: learn the standardisation on the training split only,
# then apply the same transform to both splits.
scale = StandardScaler().fit(X_train)
X_train, X_test = scale.transform(X_train), scale.transform(X_test)

**5.1.1 LOGISTIC REGRESSION**

In [None]:
# Default logistic regression, fitted on the scaled training split and scored on the hold-out split.
log = LogisticRegression().fit(X_train, y_train)
y_pred_log = log.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log) * 100
for report_line in (
    "Training score of Logistic Regression is : {:.2f}".format(log.score(X_train, y_train) * 100),
    "Accuracy Score : {:.2f}%".format(acc_log),
    "Confusion matrix : \n{}".format(confusion_matrix(y_test, y_pred_log)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_log)),
):
    print(report_line)

**5.1.2 KNN CLASSIFIER**

In [None]:
# Sweep k = 3..14 and record the hold-out accuracy of a KNN classifier for each k.
acc_values=[]
neighbors=np.arange(3,15)
for k in neighbors:
  classifier=KNeighborsClassifier(n_neighbors=k,metric='minkowski')
  classifier.fit(X_train,y_train)
  y_pred_knn=classifier.predict(X_test)
  acc_knn=accuracy_score(y_test,y_pred_knn)
  acc_values.append(acc_knn)
# After the loop, `classifier`, `y_pred_knn` and `acc_knn` still hold the values for the last k (14).
print("Accuracy values:\n",acc_values)
print("Maximum value among all : ",max(acc_values))

In [None]:
# Hold-out accuracy as a function of k from the sweep.
plt.plot(neighbors,acc_values,'o-')
plt.xlabel('k_value')
plt.ylabel('Accuracy')

*From the above graph we can see that the highest accuracy value is for k=14*

In [None]:
# Fit KNN with k=14 and evaluate it on the hold-out split.
knn=KNeighborsClassifier(n_neighbors=14,metric='minkowski')
knn.fit(X_train,y_train)
# Predict with the model fitted in this cell rather than the `classifier`
# variable left over from the k sweep, so the reported metrics belong to `knn`.
y_pred_knn=knn.predict(X_test)
print("Training score of KNN is: {:.2f}".format(knn.score(X_train,y_train)*100))
acc_knn=accuracy_score(y_test,y_pred_knn)*100
print("Accuracy of KNN is: {:.2f}%".format(acc_knn))
print("Confusion matrix of KNN is: \n{}".format(confusion_matrix(y_test,y_pred_knn)))
print("Classification report is : \n{}".format(classification_report(y_test,y_pred_knn)))

**5.1.3 SVM LINEAR**

In [None]:
# Linear-kernel SVM, fitted on the scaled training split and scored on the hold-out split.
svml = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = svml.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm) * 100
for report_line in (
    "Training score of SVM is: {:.2f}".format(svml.score(X_train, y_train) * 100),
    "Accuracy of SVM is: {:.2f}%".format(acc_svm),
    "Confusion matrix of SVM is: \n{}".format(confusion_matrix(y_test, y_pred_svm)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_svm)),
):
    print(report_line)

**5.1.4 SVM POLYNOMIAL**

In [None]:
# Degree-3 polynomial-kernel SVM, fitted on the scaled training split and scored on the hold-out split.
svm_poly = SVC(kernel='poly', degree=3).fit(X_train, y_train)
y_pred_svm_poly = svm_poly.predict(X_test)
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly) * 100
for report_line in (
    "Training score of SVM is: {:.2f}".format(svm_poly.score(X_train, y_train) * 100),
    "Accuracy of SVM is: {:.2f}%".format(acc_svm_poly),
    "Confusion matrix of SVM is: \n{}".format(confusion_matrix(y_test, y_pred_svm_poly)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_svm_poly)),
):
    print(report_line)

**5.1.5 SVM RADIAL**

In [None]:
# RBF-kernel SVM, fitted on the scaled training split and scored on the hold-out split.
svm1 = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_svm1 = svm1.predict(X_test)
acc_svm1 = accuracy_score(y_test, y_pred_svm1) * 100
for report_line in (
    "Training score of SVM is: {:.2f}".format(svm1.score(X_train, y_train) * 100),
    "Accuracy of SVM is: {:.2f}%".format(acc_svm1),
    "Confusion matrix of SVM is: \n{}".format(confusion_matrix(y_test, y_pred_svm1)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_svm1)),
):
    print(report_line)

**5.1.6 ADA BOOST CLASSIFIER**

In [None]:
# Default AdaBoost classifier, fitted on the scaled training split and scored on the hold-out split.
adb = AdaBoostClassifier().fit(X_train, y_train)
y_pred_adb = adb.predict(X_test)
acc_adb = accuracy_score(y_test, y_pred_adb) * 100
for report_line in (
    "Training score of Ada Boost Classifier is : {:.2f}".format(adb.score(X_train, y_train) * 100),
    "Accuracy of Ada Boost Classifier is : {:.2f}%".format(acc_adb),
    "Confusion matrix of Ada Boost Classifier is : \n{}".format(confusion_matrix(y_test, y_pred_adb)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_adb)),
):
    print(report_line)

**5.1.7 XG BOOST CLASSIFIER**

In [None]:
# Default XGBoost classifier, fitted on the scaled training split and scored on the hold-out split.
xg = XGBClassifier()
xg.fit(X_train, y_train)
y_pred_xg = xg.predict(X_test)
acc_xg = accuracy_score(y_test, y_pred_xg) * 100
for report_line in (
    "Training score of XGB Classifier is : {}".format(xg.score(X_train, y_train) * 100),
    "Accuracy of XGB Classifier is: {}".format(acc_xg),
    "Confusion matrix of XGB Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_xg)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_xg)),
):
    print(report_line)

**5.1.8 GRADIENT BOOSTING CLASSIFIER**

In [None]:
# Default gradient boosting classifier, fitted on the scaled training split and scored on the hold-out split.
gb = GradientBoostingClassifier().fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
acc_gb = accuracy_score(y_test, y_pred_gb) * 100
for report_line in (
    "Training score of GB Classifier is : {}".format(gb.score(X_train, y_train) * 100),
    "Accuracy of GB Classifier is: {}".format(acc_gb),
    "Confusion matrix of GB Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_gb)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_gb)),
):
    print(report_line)

**5.1.9 GAUSSIAN NAIVE BAYES**

In [None]:
# Gaussian naive Bayes, fitted on the scaled training split and scored on the hold-out split.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb) * 100
for report_line in (
    "Training score of GaussianNB is : {}".format(nb.score(X_train, y_train) * 100),
    "Accuracy of Naive Bayes Classifier is : {}%".format(acc_nb),
    "Confusion matrix of Naive Bayes Classifier is : \n{}".format(confusion_matrix(y_test, y_pred_nb)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_nb)),
):
    print(report_line)

**5.1.10 BERNOULLI NAIVE BAYES**

In [None]:
# Bernoulli naive Bayes, fitted on the scaled training split and scored on the hold-out split.
bnb=BernoulliNB()
bnb.fit(X_train,y_train)
y_pred_bnb=bnb.predict(X_test)
# The training-score message names BernoulliNB, the model actually fitted in this cell.
print("Training score of BernoulliNB is : {}".format(bnb.score(X_train,y_train)*100))
acc_bnb=accuracy_score(y_test,y_pred_bnb)*100
print("Accuracy of Naive Bayes Classifier is : {}%".format(acc_bnb))
print("Confusion matrix of Naive Bayes Classifier is : \n{}".format(confusion_matrix(y_test,y_pred_bnb)))
print("Classification report is : \n{}".format(classification_report(y_test,y_pred_bnb)))

**5.1.11 DECISION TREE**

In [None]:
# Entropy-criterion decision tree, fitted on the scaled training split and scored on the hold-out split.
dect = DecisionTreeClassifier(criterion="entropy").fit(X_train, y_train)
y_pred_dt = dect.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt) * 100
for report_line in (
    "Training score of Decision Tree Classifier is : {}".format(dect.score(X_train, y_train) * 100),
    "Accuracy score is : {}%".format(acc_dt),
    "Confusion matrix : \n{}".format(confusion_matrix(y_test, y_pred_dt)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_dt)),
):
    print(report_line)

**5.1.12 RANDOM FOREST CLASSIFIER**

In [None]:
# 300-tree random forest, fitted on the scaled training split and scored on the hold-out split.
RF = RandomForestClassifier(n_estimators=300).fit(X_train, y_train)
y_pred_RF = RF.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_RF) * 100
for report_line in (
    "Training score of Random Forest Classifier is : {}".format(RF.score(X_train, y_train) * 100),
    "Accuracy of Random Forest Classifier is: {}%".format(acc_rf),
    "Confusion matrix of Random Forest Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_RF)),
    "Classification report is : \n{}".format(classification_report(y_test, y_pred_RF)),
):
    print(report_line)

**5.2 COMPARISON OF CLASSIFIERS**

In [None]:
# Side-by-side hold-out accuracy of every model fitted above.
# Each model contributes exactly one precomputed value, so a bar chart is used;
# px.histogram would aggregate and label the axis "sum of Accuracy".
classifiers=["LogisticRegression","KNNClassifier","SVM-Linear","SVM-Poly","SVM-Radial","ADABoost","XG","GradientBoosting","GuassianNB","BernoulliNB","DecisionTree","RandomForest"
]
accuracy_=[acc_log,acc_knn,acc_svm,acc_svm_poly,acc_svm1,acc_adb,acc_xg,acc_gb,acc_nb,acc_bnb,acc_dt,acc_rf]
df_ac=pd.DataFrame({'Model':classifiers,"Accuracy":accuracy_})
px.bar(data_frame=df_ac,x="Model",y="Accuracy",color="Model")

*Gradient Boosting Classifier seems to be the best model based on accuracy score.*

**5.3 K-FOLD CROSS VALIDATION**

In [None]:
# 10-fold cross-validated accuracy of every candidate model on the scaled training split.
K_Fold = KFold(n_splits=10)
abc=[]  # mean CV accuracy per model, in the order of `Classifiers`
Acc = []  # per-fold accuracy arrays, in the same order
Classifiers = ["Logistic Regression","KNNClassifier","SVM-Linear","SVM-Poly","SVM-Radial","ADABoost","XG","GradientBoosting","GuassianNB","BernoulliNB","DecisionTree","RandomForest"
]
# `Models` must line up one-to-one with `Classifiers`: the "BernoulliNB" slot
# holds BernoulliNB() (it previously held a second GaussianNB()).
# NOTE(review): KNN uses n_neighbors=7 here but 14 in the single-split
# evaluation - confirm which value is intended.
Models = [LogisticRegression(),KNeighborsClassifier(n_neighbors=7,metric='minkowski'),SVC(kernel='linear'),SVC(kernel='poly',degree=3),SVC(kernel='rbf'),AdaBoostClassifier(),XGBClassifier(),GradientBoostingClassifier(),GaussianNB(),BernoulliNB(),DecisionTreeClassifier(criterion="entropy"),RandomForestClassifier(n_estimators=300)]
for model in Models:
    CV_Result = cross_val_score(model, X_train, y_train, cv=K_Fold, scoring="accuracy")
    abc.append(CV_Result.mean())
    Acc.append(CV_Result)

CV_ModelData = pd.DataFrame(abc, index=Classifiers)
CV_ModelData.columns = ["CV Mean"]
CV_ModelData

*Result : Selected Gradient Boosting Classifier as the best model*

**6. TESTING**

In [None]:
# Refit the selected Gradient Boosting model on the full processed training
# frame and predict the processed test frame with that same model.
# (`knn` was trained on standardised features, so calling it on the unscaled
# `dt` would not produce the selected model's predictions.)
gb=GradientBoostingClassifier()
gb.fit(ds,target)
predicted=gb.predict(dt)

In [None]:
# PassengerId was dropped from `dt` during preprocessing, so it is re-read from
# the raw file into a separate name. `dt` itself is left untouched so the
# prediction cell can still be re-run against the processed test frame.
test_ids=pd.read_csv('../input/titanic/test.csv',usecols=['PassengerId'])
res=pd.DataFrame({"PassengerId":test_ids["PassengerId"],"Survived":predicted})
res.to_csv('g_submission.csv',index=False)

In [None]:
# Read the written submission file back and preview its first rows.
res=pd.read_csv('g_submission.csv')
res.head()