# Titanic - Machine Learning from Disaster

<img src = "https://mollybrown.org/wp-content/uploads/2020/04/Titanic-Drawing_shutterstock_783936175_resized-1.jpg" >

# Importing the Libraries

In [None]:
# import library

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px 

import warnings
warnings.filterwarnings("ignore")

In [None]:
train = pd.read_csv("/kaggle/input/titanic/train.csv") 
train

In [None]:
gender_data = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
gender_data

In [None]:
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head() 

In [None]:
train.head()

# The meaning of each attribute is the following:

- PassengerId: the ID given to each passenger,
- Survived: the target attribute (1 for passengers who survived, 0 for those who didn't),
- Pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd class),
- Name, Sex, Age: self-explanatory,
- SibSp: Number of siblings & spouses aboard the Titanic,
- Parch: Number of parents & children aboard the Titanic,
- Ticket: Ticket number,
- Fare: Passenger fare (in pounds),
- Cabin: Passenger's cabin number, and
- Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).
- 'PassengerId' is unique to each passenger.

In [None]:
train.shape

In [None]:
train.info()

In [None]:
train.columns 

# See the Null Values in the Data. 

In [None]:
train.isnull().sum()

In [None]:
import missingno as msno
msno.matrix(train) 

- Only 3 columns have null values (Age, Cabin and Embarked); Cabin has the most, followed by Age.

# Statistical Summary of the Data. 

In [None]:
train.describe()

In [None]:
train.describe().T

- The Survived column holds 0 or 1: 0 for passengers who did not survive and 1 for those who survived.

# Let's see the Survived Column

In [None]:
train['Survived'].isnull().sum()          # There are no Null Values in the Column. 

In [None]:
train['Survived'].value_counts()

In [None]:
# Count of passengers per Survived value.
# The column is passed by name via x=: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series plus data= fails.
sns.countplot(x='Survived', data=train)

- Next, we compare how many male and female passengers survived.

In [None]:
# Survived counts split by Sex.
# x= and hue= are given as column names of `train`; a positional Series combined
# with data= is rejected by recent seaborn releases.
sns.countplot(x='Survived', hue='Sex', data=train, palette='winter')

In [None]:
# Survival rate among female passengers.
# Survived is 0/1, so its mean is the fraction of survivors.
women = train.loc[train.Sex == 'female', "Survived"]
rate_women = women.mean()
# Format as a percentage directly; round(x, 4) * 100 can print float artefacts.
print(f"% of women who survived: {rate_women:.2%}")

In [None]:
# Survival rate among male passengers.
# Survived is 0/1, so its mean is the fraction of survivors.
men = train.loc[train.Sex == 'male', "Survived"]
rate_men = men.mean()
# Same precision and wording as the women's rate; rounding to 2 decimals before
# multiplying by 100 would only ever show whole percents.
print(f"% of men who survived: {rate_men:.2%}")

In [None]:
# Interactive 3-D scatter of the training data: Name on x, Sex on y, Age on z,
# with points coloured by Age.
# NOTE(review): Survived is not encoded in this figure, so it cannot by itself
# show who survived — confirm this is the intended plot.
fig = px.scatter_3d(train, x='Name', y='Sex', z='Age', color='Age')
fig.show()

- Most male passengers died in the accident, while most female passengers survived.

# Age Column 

In [None]:
# Fill the missing ages with the mean age.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train['Age'] mutates an intermediate object, which pandas warns about and
# which stops updating `train` once copy-on-write is enabled.
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Age']

In [None]:
train['Age'].isnull().sum()          # There are no Null Values in the Column. 

# The minimum value of Fare is 0.00. This may be a genuine free ticket or a missing value, so zero fares are replaced with the mean fare below.



In [None]:
# Swap every zero fare for the mean fare (the mean is taken before replacing,
# so it still includes the zeros).
train_fare_mean = train['Fare'].mean()
is_zero_fare = train['Fare'] == 0
train['Fare'] = train['Fare'].mask(is_zero_fare, train_fare_mean)
train['Fare']

In [None]:
# Treat the literal string 'nan' as missing, then fill missing ports with the
# most frequent port. Results are assigned back rather than using inplace=True
# on train['Embarked'], which is chained in-place mutation and is not
# guaranteed to update `train` (it does not under pandas copy-on-write).
embarked = train['Embarked'].replace('nan', np.nan)
train['Embarked'] = embarked.fillna(embarked.mode()[0])

train['Embarked'].value_counts()

In [None]:
# Drop the columns that are not used as features.
# errors='ignore' plus rebinding (instead of inplace=True) makes the cell safe
# to re-run: a second run no longer raises KeyError for already-removed columns.
train = train.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
train

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(sex_codes)
train['Sex']

In [None]:
train['Sex'].value_counts()

In [None]:
# Encode the port of embarkation numerically: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(embarked_codes)

In [None]:
train['Embarked'].unique() 

In [None]:
train['Embarked'].value_counts()

In [None]:
train

In [None]:
sns.barplot(x='Pclass', y='Survived', data=train);

In [None]:
# Use larger tick labels on both axes.
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=14)

plt.figure()
# One Pclass histogram per Survived group, drawn on the same axes.
pclass_by_outcome = train.groupby('Survived')['Pclass']
fig = pclass_by_outcome.plot.hist(histtype='bar', alpha=0.8)
plt.legend(('Died', 'Survived'), fontsize=12)
plt.xlabel('Pclass', fontsize=18)
plt.show()

- We can see that 1st-class ticket holders survived at the highest rate, while most deaths occurred among 3rd-class ticket holders.

In [None]:
sns.set(style="darkgrid")
sns.countplot( x='Survived', data=train, hue="Embarked", palette="Set1");

- Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).

- Passengers who embarked at Southampton account for both the most deaths and the most survivors.

- We can see that the passengers who died and who are survived with the PassengerId 

In [None]:
# Correlation matrix between Survived and the numerical features
# (SibSp, Parch, Age and Fare), drawn as an annotated heatmap.
corr_columns = ["Survived", "SibSp", "Parch", "Age", "Fare"]
corr_matrix = train[corr_columns].corr()
g = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# Mean of Survived for each SibSp value, shown as a bar chart.
g = (
    sns.catplot(data=train, x="SibSp", y="Survived", kind="bar", height=6, palette="vlag")
    .despine(left=True)
    .set_ylabels("survival probability")
)

- Single passengers (SibSp 0) or passengers with 1-2 siblings/spouses on board (SibSp 1 or 2) have a better chance of surviving, while passengers with many siblings/spouses seem to have a lower chance.

In [None]:
# Explore Parch feature vs Survived: mean of Survived for each Parch value.
g = (
    sns.catplot(data=train, x="Parch", y="Survived", kind="bar", height=6, palette="pastel")
    .despine(left=True)
    .set_ylabels("survival probability")
)

- Small families (Parch 1, 2) have a better chance of surviving than single passengers (Parch 0), medium families (Parch 3, 4) and large families (Parch 5).

- NOTE: There is a large standard deviation in the survival of passengers with 3 parents/children.

In [None]:
# Explore Age distribution: overlaid kernel density estimates of Age for
# passengers who did not survive (red) and those who did (blue).
# fill= replaces the deprecated shade= argument of kdeplot.
age_not_survived = train.loc[train["Survived"] == 0, "Age"].dropna()
age_survived = train.loc[train["Survived"] == 1, "Age"].dropna()
g = sns.kdeplot(age_not_survived, color="Red", fill=True)
g = sns.kdeplot(age_survived, ax=g, color="Blue", fill=True)
g.set_xlabel("Age")
# A KDE curve shows a probability density, not a count of passengers.
g.set_ylabel("Density")
g = g.legend(["Not Survived", "Survived"])

- When we superimpose the two densities, we clearly see a peak (between 0 and 5) corresponding to babies and very young children.

# Let's Work on Test Data

In [None]:
test

In [None]:
test.columns

In [None]:
test.shape

In [None]:
test.head()

In [None]:
# Drop the same unused columns as in the training data.
# errors='ignore' plus rebinding (instead of inplace=True) makes the cell safe
# to re-run: a second run no longer raises KeyError for already-removed columns.
test = test.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
test

In [None]:
# Fill the missing ages in the test set with its mean age.
# The result is assigned back to the column: fillna(inplace=True) on test['Age']
# is chained in-place mutation and stops updating `test` under copy-on-write.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Age']

In [None]:
# Replace zero fares with the mean fare, as done for the training data.
# replace(0, ...) leaves NaN untouched, so missing fares are filled with the
# same mean; otherwise the estimator would reject the test rows containing NaN.
test_fare_mean = test['Fare'].mean()
test['Fare'] = test['Fare'].replace(0, test_fare_mean).fillna(test_fare_mean)
test['Fare']

In [None]:
# Encode the port of embarkation numerically: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
test['Embarked'] = test['Embarked'].map(embarked_codes)

In [None]:
test

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
test['Sex'] = test['Sex'].map(sex_codes)
test['Sex']

In [None]:
test

In [None]:
# Feature matrix and target.
# PassengerId is only a unique row identifier, so it is excluded from the
# features along with the target column; it carries no predictive signal and
# its large range distorts an unscaled linear model.
x = train.drop(columns=['Survived', 'PassengerId'])
y = train['Survived']

In [None]:
# Split the labelled data: 25% of the rows are held out for evaluation,
# with a fixed random_state so the split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.25, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression.
# max_iter is raised from the default of 100: with unscaled features the
# default solver commonly stops before converging, and the resulting warning is
# hidden by the notebook-wide warnings filter.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(x_train, y_train)

In [None]:
# Hyperparameter tuning of logistic regression with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV

parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
}
# The grid searches both l1 and l2, so the estimator needs a solver that
# supports both. The default lbfgs solver rejects l1, which makes every l1
# candidate fail to fit and silently score NaN. liblinear handles both.
lr = LogisticRegression(solver='liblinear', max_iter=1000)
# Search over `lr` (previously built but never used) rather than the already
# fitted baseline `log_reg`.
cv = GridSearchCV(lr, parameters, cv=5, n_jobs=-1)
cv.fit(x_train, y_train)
# Predictions of the refitted best estimator on the hold-out split.
cv.predict(x_test)

In [None]:
 #best_parameters
print("Best CV params", cv.best_params_) 

In [None]:
 #best_score
print("Best CV score", cv.best_score_)

In [None]:
# Class probabilities from the baseline model on the training split;
# report the single largest probability across all rows and classes.
prob = log_reg.predict_proba(x_train)
highest_probability = prob.max()
print("Maximum predicted probability", highest_probability)

In [None]:
cv.predict(x_test) 

In [None]:
finals_predictions = cv.predict(x_test) 

finals_predictions

In [None]:
# Build the submission from predictions on the real `test` set.
# The hold-out split `x_test` is a 25% sample of the training rows, so its
# predictions belong to different passengers and cannot be aligned with the
# submission rows (the leftover rows would end up as NaN).
test_features = test[x_train.columns]
# Safety net: fill any value still missing in the test features with the
# training median, because the estimator cannot predict on NaN.
# NOTE(review): the Kaggle test file is believed to contain a missing Fare — confirm.
test_features = test_features.fillna(x_train.median())

predictionsss = pd.DataFrame(
    {'Survived': cv.predict(test_features).astype(int)},
    index=test.index,
)

# One row per test passenger, keyed by that passenger's own id.
subs_dfgs = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictionsss['Survived'],
})

# Sanity checks: every test passenger has exactly one 0/1 prediction.
assert len(subs_dfgs) == len(test)
assert subs_dfgs['Survived'].isin([0, 1]).all()

subs_dfgs.to_csv('Submissionsd2s.csv', index = False)
