In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
gender_submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
gender_submission.head()

In [None]:
## Keep the test-set passenger ids; they are needed to build the submission later.
passengerid = test['PassengerId']
## Stack train on top of test and renumber the rows from 0.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
all_data.head()

In [None]:
sns.countplot(x = 'Survived' , data = all_data)

**1. DEALING WITH MISSING VALUES**

In [None]:
all_data.isnull().sum().sort_values(ascending = False)

**Embarked**

In [None]:
train['Embarked'].value_counts()

In [None]:
# Replace missing embarkation ports with the constant 'C'.
# NOTE(review): the choice of 'C' is not derived from any code in this notebook - confirm the rationale.
all_data['Embarked'] = all_data['Embarked'].fillna('C')

In [None]:
all_data[all_data['Embarked'].isnull()]

**Fare**

In [None]:
all_data['Fare'].hist(bins = 40 )

In [None]:
# Fill missing fares with the mean fare of third-class (Pclass == 3) passengers.
third_class_fares = all_data.loc[all_data["Pclass"] == 3, "Fare"]
all_data["Fare"] = all_data["Fare"].fillna(third_class_fares.mean())

In [None]:
all_data[all_data["Fare"].isnull()]


**Age**

In [None]:
plt.figure(figsize = (10,7))
sns.boxplot(x = 'Pclass' , y = 'Age' , data = all_data)

 When we separate by Pclass, we can see that the wealthier passengers in first and second class tend to be a little older than passengers in third class.
 We can therefore use the **average age of each Pclass** to impute the missing ages.

In [None]:
def impute_age (cols):
    """Return one passenger's age, filling a missing value based on class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        At least two values, in the order (Age, Pclass).

    Returns
    -------
    The original age when it is present; otherwise 37 for Pclass 1,
    29 for Pclass 2, and 24 for any other Pclass.
    """
    # Read the values by position through plain iteration. Integer-key lookup
    # (cols[0]) on a label-indexed Series is deprecated in pandas, while
    # list(...) works the same for a Series, list, tuple or array.
    age, pclass = list(cols)[:2]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
        
        

In [None]:
# Apply impute_age row by row (axis = 1); each call receives that row's Age and Pclass values.
all_data['Age'] = all_data[['Age','Pclass']].apply(impute_age , axis = 1)

In [None]:
all_data[all_data["Age"].isnull()]


**Cabin**

In [None]:
# Mark passengers without a recorded cabin with 'U'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that form is chained
# assignment, which pandas warns about and which leaves all_data unchanged
# when copy-on-write is active.
all_data['Cabin'] = all_data['Cabin'].fillna('U')

In [None]:
# Reduce every cabin value to its first character.
all_data['Cabin'] = all_data['Cabin'].map(lambda cabin: cabin[0])

In [None]:
all_data.groupby('Cabin')['Fare'].mean().sort_values()

In [None]:
def cabin_solving(i):
    """Estimate a cabin letter from a fare value.

    The fare `i` is compared against ascending upper bounds; the letter of the
    first bound that `i` is strictly below is returned. Anything that is not
    below the last bound (116) yields "B".
    """
    # (exclusive upper bound of the fare band, cabin letter for that band)
    fare_bands = (
        (16, "G"),
        (27, "F"),
        (38, "T"),
        (47, "A"),
        (53, "E"),
        (54, "D"),
        (116, 'C'),
    )
    for upper_bound, letter in fare_bands:
        if i < upper_bound:
            return letter
    return "B"
   

In [None]:
# Passengers whose cabin is unknown ('U'). An explicit copy is taken so the
# column assignment below writes to an independent frame instead of a filtered
# slice of all_data (which triggers pandas' SettingWithCopyWarning).
with_N = all_data[all_data.Cabin == "U"].copy()

# Passengers that already have a cabin letter.
without_N = all_data[all_data.Cabin != "U"]

## Estimate a cabin letter from the fare for the unknown-cabin passengers.
with_N['Cabin'] = with_N.Fare.apply(cabin_solving)


In [None]:
all_data[all_data['Cabin'].isnull()]

In [None]:
all_data.isnull().sum().sort_values(ascending = False)

In [None]:
## Recombine the two cabin subsets.
## PassengerId helps us separate train and test, so order the rows by it.
all_data = pd.concat([with_N, without_N], axis=0).sort_values(by = 'PassengerId')

## Separating train and test from all_data: rows before position 891 are train, the rest are test.
train = all_data.iloc[:891]
test = all_data.iloc[891:]




**2. DATA VISUALISATION**

2a. UNIVARIATE ANALYSIS

* Categorical variables: Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch
* Numerical variables: Fare, Age and PassengerId


In [None]:
def bar_plot(variable):
    """
        Draw a bar chart of how often each value of a train column occurs.

        input: variable - name of a column in train, ex: "Sex"
        output: bar plot & printed value counts
    """
    # frequency of every distinct value in the column
    counts = train[variable].value_counts()
    categories = counts.index

    # one bar per distinct value, with the value itself as the tick label
    plt.figure(figsize = (9,3))
    plt.bar(categories, counts)
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()
    print("{}: \n {}".format(variable, counts))
 

In [None]:
category1 = ["Survived","Sex","Pclass","Embarked","SibSp", "Parch"]
for c in category1:
    bar_plot(c)

In [None]:
def histogram(variable):
    """Plot a 10-bin histogram of a column of train.

    input: variable - name of a column in train, ex: "Age"
    output: histogram with the column name on the x-axis
    """
    plt.figure(figsize = (9,3))
    ax = train[variable].hist(bins = 10)
    ax.set_xlabel(variable)
    # Bar heights are the number of rows per bin, not survival values.
    ax.set_ylabel('Frequency')

In [None]:
variables = ['Age' , 'Fare']
for c in variables:
    histogram(c)

**2b. BIVARIATE ANALYSIS**

* **Gender and Survived**

In [None]:
sns.set(style="darkgrid")
plt.subplots(figsize = (15,8))
sns.countplot(x = 'Survived' , hue = 'Sex' , data = all_data)
plt.title("Survived/Non-Survived Passenger Gender Distribution", fontsize = 25,loc = 'center', pad = 40)
# countplot draws raw counts with Survived on the x-axis (Sex is only the hue),
# so the axis labels name those quantities.
plt.ylabel("Number of passengers", fontsize = 15)
plt.xlabel("Survived",fontsize = 15);

* **PClass and Survived**

In [None]:
sns.set(style="darkgrid")
plt.subplots(figsize = (15,8))
sns.countplot(x = 'Survived' , hue = 'Pclass' , data = all_data)
plt.title("Survived/Non-Survived Passenger Pclass Distribution", fontsize = 25,loc = 'center', pad = 40)
# countplot draws raw counts with Survived on the x-axis (Pclass is only the hue),
# so the axis labels name those quantities.
plt.ylabel("Number of passengers", fontsize = 15)
plt.xlabel("Survived",fontsize = 15);

* **Fare and Survived**

In [None]:
fig = plt.figure(figsize=(15,8),)
# One density curve per outcome, drawn on the same axes.
ax=sns.kdeplot(train.loc[(train['Survived'] == 0),'Fare'] , color='blue',label='not survived')
ax=sns.kdeplot(train.loc[(train['Survived'] == 1),'Fare'] , color='g', label='survived')
plt.title('Fare Distribution Survived vs Non Survived', fontsize = 25, pad = 40)
# A KDE curve shows estimated density, not a passenger frequency.
plt.ylabel("Density", fontsize = 15, labelpad = 20)
plt.xlabel("Fare", fontsize = 15, labelpad = 20)
# The label= arguments are only displayed once a legend is requested.
plt.legend();

* **Age and Survived**

In [None]:
fig = plt.figure(figsize=(15,8),)
# One density curve per outcome, drawn on the same axes.
ax=sns.kdeplot(train.loc[(train['Survived'] == 0),'Age'] , color='blue',label='not survived')
ax=sns.kdeplot(train.loc[(train['Survived'] == 1),'Age'] , color='g', label='survived')
plt.title('Age Distribution - Survivors V.S. Non Survivors', fontsize = 25, pad = 40)
plt.xlabel("Age", fontsize = 15, labelpad = 20)
# A KDE curve shows estimated density, not a passenger frequency.
plt.ylabel('Density', fontsize = 15, labelpad= 20)
# The label= arguments are only displayed once a legend is requested.
plt.legend();

**3. FEATURE ENGINEERING**

**title**

Getting the title of each name as a new feature

In [None]:
all_data['title']=all_data.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())

In [None]:
# Lookup table that collapses the raw titles extracted from Name into six
# groups: Officer, Royalty, Mr, Mrs, Miss and Master.
newtitles={
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"}
# Series.map with a dict yields NaN for any title that is not a key above.
all_data['title']=all_data.title.map(newtitles)

In [None]:
all_data['title'].unique()

In [None]:
sns.countplot( x = 'title' , data = all_data)

In [None]:
all_data.drop(['Name'], axis = 1 , inplace = True)

**Ticket**

In [None]:
all_data['Ticket'].value_counts().sort_values(ascending = False)

In [None]:
all_data.drop(['Ticket'] , axis = 1,inplace = True)

**Family_size**

In [None]:
all_data["Family_size"] = all_data["Parch"] + all_data["SibSp"] + 1
all_data['Family_size'].head()

In [None]:
# Pass the column by keyword: a positional first argument clashes with data=
# on current seaborn, where the plotted variable must be given as x=.
sns.countplot(x = 'Family_size', data = all_data)

In [None]:
all_data.drop(['SibSp','Parch'] , axis = 1, inplace = True)

**PassengerId**

In [None]:
all_data.drop(['PassengerId'], axis = 1,inplace = True)

**Encoding of categorical variables**

In [None]:
all_data['Sex'] = all_data.Sex.map({'male' : 0 , 'female' : 1})

In [None]:
converting_features = ["Embarked", "Pclass", "Cabin", "title"]
all_data = pd.get_dummies(all_data, columns=converting_features, prefix=converting_features)
all_data.head()

In [None]:
cols = all_data.columns.tolist()
print(cols)

In [None]:
# Split the encoded frame positionally: the first len(train) rows become train, the remainder test.
# `train` on the right-hand side is still the frame from the earlier split, so its length sets the cut point.
train = all_data[:len(train)]
test = all_data[len(train):]

**Separating dependent and independent variables**

In [None]:
y = train['Survived']
X_train = train.drop('Survived', axis = 1)
test = test.drop('Survived', axis = 1)
print(X_train.shape)
print(test.shape)

**Splitting the training data**

In [None]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X_train , y , test_size = 0.33, random_state = 0)


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


**4. FEATURE SCALING**

*Before Scaling*

In [None]:
headers = X_train.columns 
X_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
# Learn the mean and standard deviation from the training split only.
X_train = std_scale.fit_transform(X_train)
# Reuse those statistics for the hold-out split and the submission set.
# The model is fitted on scaled features, so everything it scores or predicts
# on must go through the same transformation; re-fitting on `test` would scale
# it with different statistics.
X_test = std_scale.transform(X_test)
test = std_scale.transform(test)


*After Scaling*

In [None]:
pd.DataFrame(X_train , columns = headers).head()

**5. MODELLING  THE  DATA**

In [None]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()

In [None]:
logmodel.fit(X_train, y_train)
acc_log_train = round(logmodel.score(X_train, y_train)*100,2) 
acc_log_test = round(logmodel.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_log_train))
print("Testing Accuracy: % {}".format(acc_log_test))


In [None]:
prediction = logmodel.predict(X_test)

In [None]:
prediction

**Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
prediction = logmodel.predict(X_test)
cm = confusion_matrix(y_test, prediction)
cm


In [None]:
from sklearn.model_selection import cross_val_score

**Logistic Regression**

In [None]:
lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr , X_train , y_train , cv = 5)
print(cv)
print(cv.mean())

**Random Forest** 

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(rf , X_train , y_train , cv = 5)
print(cv)
print(cv.mean())


**Support Vector Classifier**

In [None]:
from sklearn.svm import SVC
svc = SVC(probability = True)
cv = cross_val_score(svc , X_train , y_train , cv = 5)
print(cv)
print(cv.mean())

**Voting Classifiers**

In [None]:
from sklearn.ensemble import VotingClassifier 
voting = VotingClassifier(estimators = [('LogisticRegression' , lr) , ('RandomForest' , rf) , ('Support Vector Classifier' , svc)] ,  voting = 'soft')
cv = cross_val_score(voting , X_train , y_train , cv = 5)
print(cv)
print(cv.mean())

In [None]:
voting.fit(X_train , y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Hold-out accuracy of the fitted voting ensemble, rounded to three decimals.
holdout_predictions = voting.predict(X_test)
voting_accy = round(accuracy_score(y_test, holdout_predictions), 3)
print(voting_accy)

**SUBMITTING THE TEST PREDICTIONS**

In [None]:
# Predict survival for the Kaggle test set with the fitted voting ensemble.
predictions = voting.predict(test)
# Build the two-column submission table, with both columns cast to integers.
output = pd.DataFrame({
    'PassengerId': passengerid.astype(int),
    'Survived': predictions.astype(int),
})
output.to_csv('My_Submission.csv', index=False)
output.head()