# Titanic - Machine Learning from Disaster


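This notebook is an entry for the Kaggle competition "Titanic - Machine Learning from Disaster".<br>
https://www.kaggle.com/competitions/titanic<br>
About the data:<br>
The dataset describes Titanic passengers (class, sex, age, fare, family aboard, port of embarkation, ...) and the goal is to predict whether each passenger survived.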

### Import Libraries and packages

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

#machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# FutureWarning messages raised by cells further down - consider narrowing
# the filter to specific categories.
warnings.filterwarnings('ignore')

### Loading Data

In [3]:
# Load the competition's train and test files and stack them so that feature
# engineering is applied to both in one pass.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# the index labels of test_data repeat those of train_data, and the duplicate
# labels break index-aligned operations and some seaborn plots later on.
combined_data = pd.concat([train_data, test_data], sort=False, ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic/train.csv'

In [None]:
test_data.head()

In [None]:
test_data.isnull().sum()

### Exploratory Data Analysis

In [None]:
combined_data.head()

In [None]:
#describing the data
combined_data.describe(include=['O']) # that includes categorical

In [None]:
combined_data.describe(include=['number']).T #that includes numerical

### Check for Missing Values

In [None]:
combined_data.info()

In [None]:
combined_data.isnull().sum()

## Data Visualization

In [None]:
# Overall share of each Survived value (not split by Sex); rows whose
# Survived value is missing are ignored by value_counts.
combined_data["Survived"].value_counts(normalize=True)

In [None]:
sns.countplot(x="Survived", data=combined_data)
plt.show()

In [None]:
#combined_data["Survived"].groupby(train_data["Sex"]).value_counts()

In [None]:
train_data["Survived"].groupby(train_data["Sex"]).value_counts(normalize=True)

In [None]:
sns.countplot(x="Survived",hue="Sex", data=combined_data)
plt.show()

In [None]:
sns.countplot(x="Survived",hue="Pclass", data=combined_data)
plt.show()

In [None]:
# Per-class passenger totals and survivor counts.
# Only rows with a known outcome are counted: combined_data also holds the
# rows loaded from test.csv, whose Survived value is missing. Counting them in
# 'Total' (as before) deflates any Survived / Total ratio computed from
# merged_data.
labelled_passengers = combined_data[combined_data['Survived'].notna()]
total_passengers = labelled_passengers.groupby('Pclass').size().reset_index(name='Total')
survived_passengers = (
    labelled_passengers[labelled_passengers['Survived'] == 1]
    .groupby('Pclass')
    .size()
    .reset_index(name='Survived')
)
merged_data = pd.merge(total_passengers, survived_passengers, on='Pclass')


In [None]:
merged_data.head()

In [None]:
merged_data_percentage = merged_data['Survived'] / merged_data['Total'] * 100
merged_data_percentage

In [None]:
# Overlay survivors (green) on top of the per-class totals (gray).
# Both layers share one Axes and are labelled so the legend explains the
# colours; the original set label= but never drew a legend.
fig, ax = plt.subplots()
sns.barplot(x="Pclass", y="Total", data=merged_data, color='gray', label="Total", ax=ax)
sns.barplot(x="Pclass", y="Survived", data=merged_data, color="green", label="Survived", ax=ax)
ax.set(title="Passengers and survivors by class", xlabel="Pclass", ylabel="Number of passengers")
ax.legend();

In [None]:
g = sns.catplot(x="Pclass", hue="Sex", col="Survived",
                data=train_data, kind="count",
                height=4, aspect=.7, palette="coolwarm");

In [None]:
combined_data[["Survived", "Pclass"]].groupby(["Pclass"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
# Survival rate (in percent) and number of labelled passengers for every
# Pclass / Sex combination.
pclass_sex_stats = (
    combined_data
    .groupby(['Pclass', 'Sex'])['Survived']
    .agg(['mean', 'count'])
    .reset_index()
)
survival_rates = pd.DataFrame({
    'Pclass': pclass_sex_stats['Pclass'],
    'Sex': pclass_sex_stats['Sex'],
    'Total Passengers': pclass_sex_stats['count'],
    'Survival Rate (%)': pclass_sex_stats['mean'] * 100,
})
survival_rates

In [None]:
combined_data[["Survived", "Sex"]].groupby(["Sex"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
combined_data[["Survived", "Embarked"]].groupby(["Embarked"], as_index=False).mean().sort_values(by="Survived", ascending=False)

In [None]:
sns.countplot(x="Survived",hue="Embarked", data=combined_data)
plt.show()

In [None]:
sns.countplot(x="Survived",hue="Parch", data=combined_data)
plt.show()


In [None]:
plt.figure(figsize=(8,8))
combined_data = combined_data.reset_index(drop=True)
sns.histplot(data=combined_data, x="Age", hue="Survived", multiple="stack", kde=True, palette="coolwarm", bins=8, alpha=0.7);

In [None]:
plt.figure(figsize=(15, 15))
combined_data = combined_data.reset_index(drop=True)
sns.histplot(data=combined_data, x="Fare", hue="Survived", multiple="stack", kde=True, palette="coolwarm", bins=8, alpha=0.7);

In [None]:
combined_data["Fare"].max(), combined_data["Fare"].min(),

In [None]:
# Survived counts per 100-wide fare band.
# The edges run 0, 100, ..., 600 and include_lowest=True closes the first band
# at 0. The previous edges stopped at 500 and the first interval was open at 0,
# so fares above 500 and fares equal to 0 fell outside every band and were
# silently dropped (see the max/min cell above for the actual fare range).
fare_bins = np.arange(0, 700, 100)
combined_data["Survived"].groupby(
    pd.cut(combined_data["Fare"], fare_bins, include_lowest=True)
).value_counts()

In [None]:
# Share of each Survived value within every 100-wide fare band.
# The edges run 0, 100, ..., 600 and include_lowest=True closes the first band
# at 0, so neither zero fares nor fares above 500 are dropped from the bands
# (the previous edges stopped at 500 with an open lower bound).
fare_bins = np.arange(0, 700, 100)
combined_data["Survived"].groupby(
    pd.cut(combined_data["Fare"], fare_bins, include_lowest=True)
).value_counts(normalize=True)

In [None]:
# Compare survival between passengers who paid a fare >= 80 and those who
# paid < 80.
# NOTE: despite the "101" / "100" suffixes in the variable names, the
# threshold used for the split is 80.
combined_data_fare101 = combined_data[combined_data["Fare"] >= 80]
combined_data_fare100 = combined_data[combined_data["Fare"] < 80]
# Share of each Survived value in the fare >= 80 group.
combined_data_fare101["Survived"].value_counts(normalize=True)


In [None]:
combined_data_fare100 ["Survived"].value_counts(normalize=True)

In [None]:
sns.countplot(x="Survived",hue="Sex", data=combined_data_fare101)
plt.show()

In [None]:
sns.countplot(x="Survived",hue="Sex", data=combined_data_fare100 )
plt.show()

In [None]:

# Binary fare flag: 1 for fares of 100 or more, 0 otherwise.
# The vectorised comparison puts a missing fare in category 0. The previous
# `0 if x < 100 else 1` lambda put it in the high-fare category, because
# NaN < 100 evaluates to False.
combined_data["Fare_cat"] = (combined_data["Fare"] >= 100).astype(int)
#visualize
sns.countplot(x="Survived", hue="Fare_cat", data=combined_data)
plt.show()




In [None]:
#percent survived per sibsp and total survived
combined_data["Survived"].groupby(combined_data["SibSp"]).value_counts(normalize=True)

In [None]:
# Survival counts by number of siblings/spouses aboard, over all passengers.
# This previously plotted combined_data_fare100 (the fare < 80 subset), while
# the surrounding SibSp cells analyse the full combined_data frame.
sns.countplot(x="Survived", hue="SibSp", data=combined_data)
plt.show()

In [None]:
# Survival rate (in percent) and number of labelled passengers for every
# SibSp / Sex combination.
sibsp_sex_stats = (
    combined_data
    .groupby(['SibSp', 'Sex'])['Survived']
    .agg(['mean', 'count'])
    .reset_index()
)
survival_rates = pd.DataFrame({
    'SibSp': sibsp_sex_stats['SibSp'],
    'Sex': sibsp_sex_stats['Sex'],
    'Total Passengers': sibsp_sex_stats['count'],
    'Survival Rate (%)': sibsp_sex_stats['mean'] * 100,
})
survival_rates

In [None]:
'''combined_data['SibSp_cat'] = train_data["SibSp"].apply(lambda x: 'A' if 0 <= x <= 2 else ('B' if 3 <= x <= 4 else 'C'))
sns.countplot(x="Survived",hue="SibSp_cat", data=train_data)
plt.show()'''

In [None]:
#combined_data["SibSp_cat"].isnull().sum()

In [None]:
#combined_data[combined_data["SibSp_cat"] == 'C']


In [None]:
'''survival_rates = combined_data.groupby(['SibSp_cat', 'Sex'])['Survived'].agg(['mean', 'count']).reset_index()
survival_rates['Survival Rate'] = survival_rates['mean'] * 100
survival_rates = survival_rates.drop(['mean'], axis=1)
survival_rates.columns = ['SibSp_cat', 'Sex', 'Total Passengers', 'Survival Rate (%)']
survival_rates'''

In [None]:
''' sns.barplot(x="SibSp_cat", y="Total Passengers", hue="Sex", 
            data=survival_rates, alpha=0.3, palette="coolwarm")

survival_rates['Survived Passengers'] = survival_rates['Total Passengers'] * survival_rates['Survival Rate (%)'] / 100

sns.barplot(x="SibSp_cat", y="Survived Passengers", 
            hue="Sex", data=survival_rates,
            palette="coolwarm")

plt.title('Total and Survived Passengers by SibSp_cat and Sex')
plt.ylabel('Number of Passengers')
plt.show() '''

In [None]:
#type(combined_data[['Name']]), type(combined_data['Name'])

In [None]:
# Extract the title (Mr, Mrs, Miss, etc.) from each name: the run of letters
# that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine unchanged;
# in a normal string literal `\.` is an invalid escape sequence that Python
# warns about.
combined_data['Title'] = combined_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tabulate titles against sex. The rows of this table also list every
# distinct title, which replaces the former bare `.unique()` expression that
# was never displayed because it was not the last line of the cell.
pd.crosstab(combined_data['Title'], combined_data['Sex'])

In [None]:
# Collapse the long tail of titles into a handful of groups.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
combined_data['Title'] = combined_data['Title'].replace(rare_titles, 'Rare')
combined_data['Title'] = combined_data['Title'].replace(['Mlle', 'Ms'], 'Miss')
# 'Dona' used to be listed here as well, but it is already folded into 'Rare'
# above, so that entry could never match. Resulting titles are unchanged.
combined_data['Title'] = combined_data['Title'].replace('Mme', 'Mrs')
# Mean survival per grouped title.
combined_data[['Title','Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Numeric code for each grouped title; any title missing from the mapping
# is encoded as 0.
title_map = dict(Mr=1, Rare=2, Master=2, Miss=4, Mrs=4)
title_codes = combined_data['Title'].map(title_map)
combined_data['Title_ord'] = title_codes.fillna(0)

In [None]:
# Encode Sex as an integer: male -> 1, female -> 0.
# Guarded so that re-running the cell is a no-op: mapping the already encoded
# column would produce NaN everywhere and make astype(int) raise.
if not pd.api.types.is_numeric_dtype(combined_data['Sex']):
    combined_data['Sex'] = combined_data['Sex'].map({'male': 1, 'female': 0}).astype(int)

In [None]:
combined_data.head()

In [None]:
sns.countplot(x="Survived",hue="Parch", data=combined_data)

In [None]:
combined_data[['Parch','Survived']].groupby(['Parch'], as_index=False).mean()


In [None]:
# Group Parch into three codes: {1, 2, 3} -> 3, {0, 5} -> 2, {4, 6, 7, 9} -> 1.
# A Parch value that is not a key of the mapping becomes NaN, so the null
# counts displayed below also reveal any unmapped value.
combined_data['Parch_cat']=combined_data['Parch'].map({0:2, 5:2, 4:1, 6:1, 7:1, 1:3, 2:3, 3:3, 9:1})
combined_data.isnull().sum()
#combined_data['Parch_cat'].unique()

In [None]:
combined_data[['Embarked','Survived']].groupby(['Embarked'], as_index=False).mean()

In [None]:
# Fill missing ports with 'Q', then encode the port as an integer:
# 'C' -> 2, while 'S' and 'Q' share the code 1.
embarked_filled = combined_data['Embarked'].fillna('Q')
combined_data['Embarked'] = embarked_filled
embarked_codes = {'S': 1, 'C': 2, 'Q': 1}
combined_data['Embarked_ord'] = embarked_filled.map(embarked_codes).astype(int)

In [None]:
combined_data['Sex']

In [None]:
# Replace missing fares with the placeholder value 1.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place
# form is deprecated in recent pandas and has no effect on the frame under
# copy-on-write.
combined_data['Fare'] = combined_data['Fare'].fillna(1)

In [None]:
combined_data.isnull().sum()

In [None]:
# Replace Age with an ordinal band code:
#   0: age <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: age > 64
# Missing ages stay NaN. pd.cut assigns every band in one pass; the previous
# chain of .loc assignments ended with a bare selection for ages above 64
# (no "= 4"), so those rows kept their raw age next to the 0-3 codes.
# NOTE: run this cell once per load of the data - running it again would
# re-bin the codes themselves.
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]
combined_data['Age'] = pd.cut(combined_data['Age'], bins=age_band_edges, labels=False)
combined_data.head()

In [None]:
# Keep only the engineered / numeric features: identifiers, free-text columns
# and the raw columns that were re-encoded above are removed.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']
df = combined_data.drop(columns=columns_to_drop)
df.head()

In [None]:
# Impute missing Age values. Age was converted to ordinal band codes in an
# earlier cell, so the median is used: it is itself a valid band code (or the
# midpoint of two), whereas the mean is an arbitrary fraction between bands.
# The result is assigned back because fillna(..., inplace=True) on a column
# selection is a chained assignment that is deprecated in recent pandas and
# has no effect on df under copy-on-write.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
df.isnull().sum()

In [None]:
# Split the engineered frame back into the labelled (training) rows and the
# unlabelled rows to predict. Selecting on the presence of a Survived value
# replaces the hard-coded row count 891, so the split stays correct if the
# input files change size.
has_label = df['Survived'].notna()
train_data = df[has_label]
test_data = df[~has_label]

In [None]:
train_data.shape, test_data.shape

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.isnull().sum()

In [None]:
# Standardise the features with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only and reuse the same fitted
# scaler for the test features. Fitting a second scaler on the test set (as
# before) standardises the two sets with different means and variances, so
# the model would see test features on a different scale than it was trained on.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(train_data.drop("Survived", axis=1))
y_train = train_data["Survived"]
# Kept so the name stays defined for any later cell; it is taken from the
# rows to predict, so it carries no usable ground truth for evaluation.
y_test = test_data["Survived"]

X_test = feature_scaler.transform(test_data.drop("Survived", axis=1))

## Modelling

In [None]:
from  sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
#accuracy and other evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
def model_classification(X,y):
    '''
    Fit several classifiers on a 70/30 hold-out split and compare them.

    X: independent variables (feature matrix)
    y: dependent variable (class labels)
    return: tuple (best_model, best_accuracy, best_confusion_matrix) where
            best_model is the fitted estimator with the highest hold-out
            accuracy (the first one listed wins a tie) and the confusion
            matrix belongs to that same model (rows = true class,
            columns = predicted class).
    '''
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # The stochastic estimators are seeded so the comparison is reproducible.
    models = [
        GaussianNB(),
        BernoulliNB(),
        LogisticRegression(),
        RandomForestClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42),
        KNeighborsClassifier(n_neighbors=5),
    ]
    results = []
    confusion_matrices = []
    for model in models:
        model.fit(x_train, y_train)
        model_predict = model.predict(x_test)
        # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric,
        # but with the arguments swapped the confusion matrix is transposed
        # and precision/recall trade places in the classification report.
        accuracy = accuracy_score(y_test, model_predict)
        matrix = confusion_matrix(y_test, model_predict)
        print("Model: ", model)
        print("Model Accuracy: ", accuracy)
        print("Model Confusion Matrix: ", matrix, "\n")
        print("Model Classification Report: ", classification_report(y_test, model_predict))
        print("-"*50)
        results.append(accuracy)
        confusion_matrices.append(matrix)
    #best model
    best_index = results.index(max(results))
    best_model = models[best_index]
    print("Best Model: ", best_model)
    print("Best Model Accuracy: ", results[best_index])
    # Model names are derived from the estimators themselves so the score
    # table cannot drift out of sync with the `models` list.
    score_table = pd.DataFrame({
        'Model': [type(model).__name__ for model in models],
        'Score': results})
    print(score_table.sort_values(by='Score', ascending=False, ignore_index=True))
    # Return the confusion matrix of the best model, not of whichever model
    # happened to be evaluated last.
    return best_model, results[best_index], confusion_matrices[best_index]

In [None]:
model_classification(X_train,y_train)

In [None]:
# Train the final random forest on the whole training set.
# random_state is fixed so the fitted forest, and therefore the predictions
# written to the submission, are reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train,y_train)

y_pred=rf_model.predict(X_test)
# Accuracy (in percent) on the data the model was trained on. This is an
# optimistic figure, not an estimate of performance on unseen passengers.
aac = round(rf_model.score(X_train, y_train) * 100, 2)
print(aac)


In [None]:
# Re-read the raw test file to recover the PassengerId column, which was
# dropped from the engineered frame. Note that this rebinds test_data to the
# raw file contents.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Predict with the fitted random forest on the scaled test features.
y_pred=rf_model.predict(X_test)
# Pair each PassengerId with its prediction; this relies on the rows of
# X_test being in the same order as the rows of the test file.
submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": y_pred.astype(int)
    })
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
#download submission
from IPython.display import FileLink
FileLink('submission.csv')