In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(root, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Read the training and test CSVs in one pass, then preview the training frame.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)
df_train.head()

# Data exploration

A first look at the data: the distributions of fare and age, and how age relates to survival.

In [4]:
# Distribution of the two continuous features, one figure per column.
# Missing values are dropped explicitly so the result does not depend on how the
# plotting backend treats NaN, and every figure is labelled so it can be read on its own.
for column in ('Fare', 'Age'):
    plt.hist(df_train[column].dropna())
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Number of passengers')
    plt.show()

In [5]:
# Split the passengers with a known age by outcome.
# Boolean indexing keeps only the matching rows and preserves the column dtypes;
# DataFrame.where would instead blank every non-matching row to NaN (upcasting the
# integer columns to float) and rely on a follow-up dropna to remove them again.
has_age = df_train['Age'].notna()
survivors_age = df_train[has_age & (df_train['Survived'] == 1)]
died_age = df_train[has_age & (df_train['Survived'] == 0)]

# Preview only a few rows instead of dumping the whole frame.
died_age.head()

In [6]:
# One age histogram per outcome group; only the second one overrides the default colour.
for ages, hist_kwargs in (
    (survivors_age['Age'], {"label": "Survived"}),
    (died_age['Age'], {"label": "Died", "color": "orange"}),
):
    plt.hist(ages, **hist_kwargs)
    plt.legend()
    plt.show()

In [7]:
import seaborn as sns

# how="all" only removes rows in which BOTH Age and Survived are missing.
age_survivors = df_train.loc[:, ["Age", "Survived"]].dropna(how="all")
print(age_survivors.head())

# Correlation matrix of the two columns, drawn as an annotated heatmap.
corr = age_survivors.corr()
axis_labels = corr.columns
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels, annot=True)

At this point, we can see that there is little linear correlation between age and survival.

In [8]:
# Remove the columns that are not used in the rest of the analysis.
dropped_columns = ["Name", "Cabin", "Ticket", "PassengerId"]
core_df = df_train.drop(columns=dropped_columns)

print(core_df.head())
print(core_df.shape)

In [9]:
# Number of missing values in each column.
core_df.isna().sum()

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
core_df.describe()

In [11]:
# core_df still holds text columns at this stage (Sex and Embarked are compared against
# string values further down), and DataFrame.corr raises on non-numeric data in
# pandas >= 2.0 unless numeric_only=True is passed.
sns.heatmap(data=core_df.corr(numeric_only=True), annot=True)

After plotting the heatmap for all columns, we can see that the Fare and Survived columns are the most correlated ones.

The next most notable pair is Parch and Survived.

In [12]:
# Wide figure so the long right tail of the fares stays readable.
plt.figure(figsize=(18, 6))
sns.boxplot(data=core_df, x='Fare')
plt.show()

In [13]:
# Passenger counts per ticket class. The figure is labelled and closed with plt.show()
# so the cell displays the plot only, not the raw (counts, bins, patches) return value.
plt.hist(core_df["Pclass"], histtype='bar')
plt.title('Passengers per class')
plt.xlabel('Pclass')
plt.ylabel('Number of passengers')
plt.show()

# Data manipulation

In [14]:
# Keep only the rows without any missing value. The explicit copy makes clean_df an
# independent frame, so the in-place column assignments performed on it in the encoding
# step do not trigger pandas' SettingWithCopyWarning.
clean_df = core_df.dropna().copy()
clean_df.shape

In [15]:
# Numeric codes for the two text columns (the same values are used for the test data).
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'S': 1, 'Q': 2, 'C': 3}
# NOTE(review): the Embarked codes impose an ordering (S < Q < C) on a nominal feature;
# consider one-hot encoding if the models should not see that ordering.

# assign() builds a new frame instead of writing into clean_df through .loc, which avoids
# SettingWithCopyWarning on the dropna() result. Values without a code are passed through
# unchanged, so re-running the cell is harmless and an unexpected label still fails loudly
# in astype(float).
clean_df = clean_df.assign(
    Sex=clean_df['Sex'].map(lambda value: sex_codes.get(value, value)).astype(float),
    Embarked=clean_df['Embarked'].map(lambda value: embarked_codes.get(value, value)).astype(float),
)
clean_df.head()

Apply the same preprocessing to the test data.

In [26]:
# PassengerId is kept here because the submission cell reads it from clean_test.
unused_columns = ["Name", "Cabin", "Ticket"]
clean_test = df_test.drop(columns=unused_columns)
clean_test.head()

In [27]:
# Numeric codes for the two text columns of the test data.
test_sex_codes = {'female': 0, 'male': 1}
test_embarked_codes = {'S': 1, 'Q': 2, 'C': 3}

# Replace each known label by its code (anything else is passed through untouched),
# then store both columns as floats.
clean_test = clean_test.assign(
    Sex=clean_test['Sex'].map(lambda value: test_sex_codes.get(value, value)).astype(float),
    Embarked=clean_test['Embarked'].map(
        lambda value: test_embarked_codes.get(value, value)
    ).astype(float),
)
clean_test.head()

In [28]:
# Fill the gaps in Age and Fare with the mean of the respective test column.
for column in ('Age', 'Fare'):
    column_mean = clean_test[column].mean()
    clean_test[column] = clean_test[column].fillna(column_mean)

# Remaining missing values per column.
clean_test.isnull().sum()

In [29]:
# Features / target from the cleaned training frame, plus the cleaned test frame.
# The train_test_split cell further down rebinds x_train, x_test and y_train.
x_test = clean_test
x_train = clean_df.drop(columns='Survived')
y_train = clean_df['Survived']

In [19]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the cleaned training rows for model comparison.
# random_state pins the shuffle so the split, and every accuracy computed from it,
# is identical on each "Restart & Run All".
x_train, x_test, y_train, y_test = train_test_split(
    clean_df.drop(columns='Survived'),
    clean_df['Survived'],
    test_size=0.3,
    random_state=1,
)

In [35]:
# Display names and estimators are kept in two parallel lists: clf_names[i] labels
# classifiers[i] in the evaluation loop, so both must stay in the same order.
clf_names = [
    "knn", "Decision Tree", "MLP", "SupportVector", "GaussianNB"
]

classifiers = [
    KNeighborsClassifier(n_neighbors=25),
    # Seeded like the MLP below so the tree's accuracy is reproducible between runs.
    DecisionTreeClassifier(random_state=1),
    MLPClassifier(max_iter=1000, solver='adam', random_state=1),
    SVC(kernel="linear"),
    GaussianNB()
]

In [36]:
# Fit every candidate on the training split and score it on the held-out split.
ress = []
for idx, model in enumerate(classifiers):
    fitted = model.fit(x_train, y_train)  # fit() returns the estimator itself
    holdout_pred = fitted.predict(x_test)
    ress.append([clf_names[idx], metrics.accuracy_score(y_test, holdout_pred)])

# One row per classifier: its display name and its hold-out accuracy.
ress_df = pd.DataFrame(ress, columns=['Classifier', 'Accuracy'])
ress_df

# Running on Test data

In [37]:
# Refit the chosen model on every cleaned training row (not just the 70 % split used for
# model comparison) so no labelled data is wasted for the submission.
final_features = clean_df.drop(columns='Survived')
clf = SVC(kernel="linear")
clf.fit(final_features, clean_df['Survived'])

# Select the test columns by name, in training order, rather than relying on the two
# frames happening to share a column order; this also leaves PassengerId out of the input.
y_pred = clf.predict(clean_test[final_features.columns])

# Submission file: one row per test passenger with its predicted outcome.
output = pd.DataFrame({'PassengerId': clean_test['PassengerId'], 'Survived': y_pred})
output.to_csv('submission.csv', index=False)

# Experimenting ground

In [28]:
# Hold-out accuracy of k-NN for every odd k from 1 to 39.
for k in range(1, 40, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    accuracy = metrics.accuracy_score(y_test, knn.predict(x_test))
    print(k, accuracy)

In [68]:
# Hold-out accuracy of a single decision tree.
# The tree is seeded so the printed accuracy is the same on every run.
trnn = DecisionTreeClassifier(random_state=1)
trnn.fit(x_train, y_train)
y_pred = trnn.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

In [72]:
# Hold-out accuracy of a seeded MLP limited to 500 training iterations.
mlpnn = MLPClassifier(max_iter=500, solver='adam', random_state=1).fit(x_train, y_train)
y_pred = mlpnn.predict(x_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)