In [None]:
# Data-handling and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# NOTE(review): this silences every warning for the whole session, which also hides
# pandas / scikit-learn deprecation and convergence notices; consider a narrower filter.
warnings.filterwarnings(action='ignore')

**Variable	Definition	Key**

* **survival**	Survival	0 = No, 1 = Yes
* **pclass**	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
* **sex**	Sex	
* **Age**	Age in years	
* **sibsp**	# of siblings / spouses aboard the Titanic	
* **parch**	# of parents / children aboard the Titanic	
* **ticket**	Ticket number	
* **fare**	Passenger fare	
* **cabin**	Cabin number	
* **embarked**	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

# Data

In [None]:
import os

# Walk the Kaggle input directory and print the full path of each file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# 1 - EDA

In [None]:
# Load the training and test splits of the Titanic competition data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Column names, non-null counts and dtypes of the training data.
train_data.info()

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# (number of rows, number of columns) of the training data.
train_data.shape

In [None]:
# Summary statistics of the training data.
train_data.describe()

In [None]:
# Median of every numeric column, as a two-column frame (columns_names, Median).
# numeric_only=True is required: the frame also holds text columns (e.g. Sex), and
# pandas >= 2.0 raises a TypeError on them instead of silently skipping them.
train_median = (
    train_data.median(numeric_only=True)
    .rename_axis('columns_names')
    .reset_index(name='Median')
)

train_median

In [None]:
# Missing values per column, reported as a DataFrame with the absolute count and
# the percentage of rows affected.
#
# Both figures are computed column-wise in one step. Filling the percentage column
# element by element through `train_nulls.percent_nulls[i] = ...` is chained
# assignment, which pandas does not guarantee to write back into the frame.
null_counts = train_data.isna().sum()
null_percent = train_data.isna().mean() * 100  # share of rows that are null, in %

train_nulls = pd.DataFrame({
    'columns_names': null_counts.index,
    'total_nulls': null_counts.to_numpy(),
    'percent_nulls': null_percent.to_numpy(),
})

train_nulls.sort_values('total_nulls')

# 1.1 - DataViz

**Here I present some questions to better understand what may have influenced the survival rate...**

In [None]:
# One histogram panel per selected column.
hist_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
data_hist = train_data.loc[:, hist_columns]

data_hist.hist(figsize=(10, 8))
plt.show()

In [None]:
# Number of rows for each value of Sex.
train_data['Sex'].value_counts()

**What is the relationship between survival and gender (sex)?**

In [None]:
# Complement of `Survived`, so that both outcomes can be summed per group.
train_data['Died'] = 1 - train_data['Survived']

# Pick the two outcome columns *before* aggregating: summing the whole frame would
# also try to "sum" every other column (including free-text ones) only to discard
# the result, and depends on all of them being summable.
(
    train_data
    .groupby('Sex')[['Survived', 'Died']]
    .sum()
    .plot(kind='bar', figsize=(6, 4))
)

In [None]:
# Number of rows for each ticket class.
train_data['Pclass'].value_counts()

**Does class influence survival?**

In [None]:
fig = plt.figure(figsize=(6, 4))

# Ticket class of survivors and non-survivors, plotted together on the same axes.
pclass_survived = train_data.loc[train_data['Survived'] == 1, 'Pclass']
pclass_died = train_data.loc[train_data['Survived'] == 0, 'Pclass']
plt.hist([pclass_survived, pclass_died], bins=12, label=['Survived', 'Died'])

plt.xlabel('Pclass')
plt.ylabel('Passengers')
plt.legend()

In [None]:
# Number of rows for each siblings/spouses count.
train_data['SibSp'].value_counts()

**Did travelling with other family members contribute to the rate of non-survivors?**

In [None]:
fig = plt.figure(figsize=(6, 4))

# Siblings/spouses count of survivors and non-survivors, plotted on the same axes.
sibsp_survived = train_data.loc[train_data['Survived'] == 1, 'SibSp']
sibsp_died = train_data.loc[train_data['Survived'] == 0, 'SibSp']
plt.hist([sibsp_survived, sibsp_died], bins=15, label=['Survived', 'Died'])

plt.xlabel('SibSp')
plt.ylabel('Passengers')
plt.legend()

**How has the fare affected the survival rate?**

In [None]:
fig = plt.figure(figsize=(6, 4))

# Fare paid by survivors and non-survivors, plotted on the same axes.
fare_survived = train_data.loc[train_data['Survived'] == 1, 'Fare']
fare_died = train_data.loc[train_data['Survived'] == 0, 'Fare']
plt.hist([fare_survived, fare_died], bins=15, label=['Survived', 'Died'])

plt.xlabel('Fare')
plt.ylabel('Passengers')
plt.legend()

# 2 - Feature Engineering

In [None]:
# Drop the rows that have no port of embarkation. The explicit copy makes
# train_data_fe an independent frame, so the column assignments performed on it in
# the following cells modify it unambiguously instead of being flagged by pandas as
# writes to a possible copy of train_data (SettingWithCopyWarning).
train_data_fe = train_data.dropna(subset=['Embarked']).copy()

# Remaining missing values per column.
train_data_fe.isna().sum()

In [None]:
# Encode the categorical columns as integers.
sex_codes = {'female': 0, 'male': 1}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

train_data_fe['Sex'] = train_data_fe['Sex'].map(sex_codes)
train_data_fe['Embarked'] = train_data_fe['Embarked'].map(port_codes)

In [None]:
# Mean age for each sex (Sex is encoded as 0 = female, 1 = male).
# NOTE: these values are means (`.mean()`); the variable names are kept unchanged.
is_male = train_data_fe['Sex'] == 1
is_female = train_data_fe['Sex'] == 0
md_age_male = train_data_fe.loc[is_male, 'Age'].mean()
md_age_famale = train_data_fe.loc[is_female, 'Age'].mean()

# Fill each missing age with the mean age of the passenger's sex.
age_missing = train_data_fe['Age'].isnull()
train_data_fe.loc[age_missing & is_female, 'Age'] = md_age_famale
train_data_fe.loc[age_missing & is_male, 'Age'] = md_age_male

train_data_fe.isna().sum()

In [None]:
# Any age that is still missing gets the overall mean age.
overall_mean_age = train_data_fe['Age'].mean()
train_data_fe['Age'] = train_data_fe['Age'].fillna(overall_mean_age)

train_data_fe.isna().sum()

In [None]:
# Preview the first rows of the engineered training frame.
train_data_fe.head()

In [None]:
# Column names, non-null counts and dtypes of the engineered training frame.
train_data_fe.info()

In [None]:
# Model inputs (X) and the target column (Y).
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_data_fe[feature_columns]
Y = train_data_fe['Survived']

In [None]:
# Missing values per feature column.
X.isna().sum()

# 3 - Modeling and Evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=95,
)

In [None]:
# Missing values per column in the training split.
x_train.isna().sum()

**Model 1 - LR**

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, f1_score

# Fit a logistic regression with default settings on the training split.
LR_model = LR()
LR_model.fit(x_train, y_train)

# Score it on the held-out split.
y_predict = LR_model.predict(x_test)
lr_accuracy = accuracy_score(y_test, y_predict)
lr_f1 = f1_score(y_test, y_predict)

print('Accuracy of the LR model was: ' + str(lr_accuracy))
print('F1_score of the LR model was: ' + str(lr_f1))

**Model 2 - RFC**

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

# Fit a random forest (seeded for repeatability) on the training split.
RFC_model = RFC(random_state=1)
RFC_model.fit(x_train, y_train)

# Score it on the held-out split.
y_predict_2 = RFC_model.predict(x_test)
rfc_accuracy = accuracy_score(y_test, y_predict_2)
rfc_f1 = f1_score(y_test, y_predict_2)

print('Accuracy of the RFC model was: ' + str(rfc_accuracy))
print('F1_score of the RFC model was: ' + str(rfc_f1))

**Model 3 - GBC**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

# Gradient boosting with default hyper-parameters. random_state is pinned (as it is
# for the random forest) so that the reported scores are reproducible across runs.
# Alternative hyper-parameters (not applied):
#   learning_rate=0.02, max_depth=3, n_estimators=50, min_samples_split=3
GBC_model = GBC(random_state=1)

GBC_model.fit(x_train, y_train)

y_predict_3 = GBC_model.predict(x_test)

# These scores belong to the gradient-boosting model, so they are labelled "GBC".
print('Accuracy of the GBC model was: ' + str(accuracy_score(y_test, y_predict_3)))
print('F1_score of the GBC model was: ' + str(f1_score(y_test, y_predict_3)))

# 4 - Test_data and Predict

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Column names, non-null counts and dtypes of the test data.
test_data.info()

In [None]:
# Encode the categorical columns of the test data as integers.
sex_codes = {'female': 0, 'male': 1}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

test_data['Sex'] = test_data['Sex'].map(sex_codes)
test_data['Embarked'] = test_data['Embarked'].map(port_codes)

In [None]:
# Preview the first rows of the test data again, after the encoding step.
test_data.head()

In [None]:
# Feature columns of the test data, in the same order as the model inputs.
# The explicit copy makes test_data_fe an independent frame: its columns are
# overwritten when missing values are filled later, and without the copy pandas
# flags those writes as assignments to a possible copy of test_data.
test_data_fe = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()

test_data_fe

In [None]:
# Column names, non-null counts and dtypes of the test feature frame.
test_data_fe.info()

In [None]:
# Impute the test features with statistics learned from the *training* data, using
# the same rule as in training (per-sex mean age). Filling with the test set's own
# overall mean would preprocess the two sets differently and would make the values
# fed to the model depend on the data being predicted.
age_missing = test_data_fe['Age'].isna()
test_data_fe.loc[age_missing & (test_data_fe['Sex'] == 0), 'Age'] = md_age_famale
test_data_fe.loc[age_missing & (test_data_fe['Sex'] == 1), 'Age'] = md_age_male

# Fare: fill with the median fare observed in the training data.
test_data_fe['Fare'] = test_data_fe['Fare'].fillna(train_data_fe['Fare'].median())

test_data_fe.isna().sum()

In [None]:
# Predict the label of every test row with the logistic-regression model.
predictions = LR_model.predict(test_data_fe)

# Display the predicted labels.
predictions

# Saving model 1 predictions

In [None]:
# Pair each PassengerId with its predicted label and write the result to CSV
# without the index column.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")