In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Read the three Titanic CSV files from the Kaggle input directory.
# `train` and `test` are cleaned and encoded by the cells below; `submission`
# (gender_submission.csv) is only previewed in the cells visible here.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [3]:
train.head()

In [4]:
train.dtypes

# Age imputation statistics

In [5]:
# Mean Age for every (Survived, Pclass, Sex) combination, shown as a one-column table.
train.groupby(['Survived','Pclass','Sex'])[['Age']].mean()

# Correlation statistics

In [6]:
# Pearson correlation between the numeric columns only.
# At this point `train` still holds text columns (Name, Sex, Ticket, Cabin,
# Embarked are only dropped/encoded in later cells), and DataFrame.corr raises
# on non-numeric data in pandas >= 2.0. Selecting the numeric columns first
# works on every pandas version and matches what older versions did silently.
numeric_train = train.select_dtypes(include='number')
corr = numeric_train.corr(method='pearson')

# Annotated heatmap of the correlation matrix, labelled with the column names.
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, ax=ax)

In [7]:
# Plot the boolean missing-value mask of `train` (True where a cell is null)
# as a heatmap, without a colour bar, to see which columns have gaps.
sns.heatmap(train.isnull(), cbar=False)

# Imputing age as per statistics

In [8]:
# Mean Age of each row's (Survived, Pclass, Sex) group. transform('mean')
# returns one value per train row, on train's own index, so fillna can line
# the group means up with the missing entries.
# NOTE(review): grouping on Survived makes the imputed ages depend on the
# target label, while the test cell imputes from (Pclass, Sex) only - confirm
# this train/test difference is intended.
age_group_mean = train.groupby(['Survived','Pclass','Sex'])['Age'].transform('mean')
train['Age'] = train['Age'].fillna(age_group_mean)

# Re-draw the missing-value mask to check the result of the imputation.
sns.heatmap(train.isnull(), cbar=False)

In [9]:
# Plot the boolean missing-value mask of `test` (True where a cell is null)
# as a heatmap, without a colour bar, to see which columns have gaps.
sns.heatmap(test.isnull(), cbar=False)

In [10]:
# Fill missing test ages with the train-set mean Age of the passenger's own
# (Pclass, Sex) group.
#
# The means are looked up through the test rows' Pclass/Sex values. Passing
# train.groupby(...).transform('mean') straight to fillna does not do that:
# the transformed Series is indexed by train's row labels, so fillna would
# give test row i the group mean of train row i, whatever group that row is in.
age_by_group = train.groupby(['Pclass','Sex'])['Age'].mean().rename('group_mean_age')
test_group_age = test[['Pclass','Sex']].join(age_by_group, on=['Pclass','Sex'])['group_mean_age']
test['Age'] = test['Age'].fillna(test_group_age)

# Re-draw the missing-value mask to check the result of the imputation.
sns.heatmap(test.isnull(), cbar=False)

# Dropping unnecessary columns from both datasets

In [11]:
# Keep the test PassengerId values before the column is removed; they are
# needed again when the submission frame is built.
test_Id  = test['PassengerId']

# Columns removed from both frames, dropped in a single call per frame.
unused_columns = ['Cabin', 'Ticket', 'Name', 'PassengerId']
train.drop(columns=unused_columns, inplace=True)
test.drop(columns=unused_columns, inplace=True)

# Mode of the Embarked column (used to impute the training dataset)

In [12]:
# Most frequent Embarked value(s) in the training frame.
train['Embarked'].mode()

In [13]:
# train[train.isnull().any(axis=1)] # to check row with NaN value

# Impute missing Embarked values with the column's most frequent value,
# computed from the data instead of being hard-coded. mode() ignores NaN and
# returns the modes sorted, so .iloc[0] is deterministic even on a tie.
embarked_mode = train['Embarked'].mode().iloc[0]

# Assign the filled column back instead of calling fillna(inplace=True) on
# train['Embarked']: that chained in-place call warns on pandas 2.2 and does
# not update `train` when Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna(embarked_mode)

# Fare of passengers by Pclass (used to impute the testing data)

In [14]:
# Mean Fare per passenger class in the training frame, shown as a one-column table.
train.groupby(['Pclass'])[['Fare']].mean()

In [15]:
# test[test.isnull().any(axis=1)] # to check row with NaN value

# Fill each missing test Fare with the train-set mean Fare of that passenger's
# own Pclass. The previous hard-coded 13.675 was applied to every missing row
# regardless of class; it was presumably read off the per-Pclass table above,
# so the filled value may differ from it in the later decimals.
fare_by_pclass = train.groupby('Pclass')['Fare'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on
# test['Fare']: that chained in-place call warns on pandas 2.2 and does not
# update `test` when Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(test['Pclass'].map(fare_by_pclass))

# Changing sex from text to number

In [16]:
# Encode Sex as an integer: male -> 1, female -> 0.
# The mapped column is assigned back instead of calling replace(inplace=True)
# on train['Sex'] / test['Sex']: that chained in-place call warns on pandas 2.2
# and does not update the frame when Copy-on-Write is enabled.
# map() expects the raw text labels: any value other than 'male'/'female'
# becomes NaN, so run this cell once on freshly loaded data.
sex_codes = {'male':1, 'female':0}
train['Sex'] = train['Sex'].map(sex_codes)
test['Sex'] = test['Sex'].map(sex_codes)

# One hot encoding for Embarked column

In [17]:
# Replace the Embarked column with indicator columns in both frames.
# drop_first=True omits the indicator of the first category, which is then
# implied when all remaining indicators are 0/False.
# The earlier mid-cell `one_hot_embarked.head()` was removed: only the last
# expression of a cell is displayed, so that call had no effect.

# training data
one_hot_embarked = pd.get_dummies(train['Embarked'], drop_first=True)
train = train.drop(columns='Embarked').join(one_hot_embarked)

# testing data
one_hot_embarked = pd.get_dummies(test['Embarked'], drop_first=True)
test = test.drop(columns='Embarked').join(one_hot_embarked)

# Preview of the test-set indicator columns (the cell's displayed output).
one_hot_embarked.head()

# Removing survived column

In [18]:
# Separate the label from the features: X is every column of `train`
# except Survived, y is the Survived column itself.
target_column = 'Survived'
X = train.drop(columns=[target_column])
y = train[target_column]

# Data alignment

In [19]:
# Give `test` exactly X's columns, in X's order (a left join on the column
# axis keeps X's columns unchanged). A column of X that `test` lacks is
# created filled with NaN; a column only `test` has is dropped.
test = test.reindex(columns=X.columns)

# Data splitting

In [20]:
# Hold out 20% of the labelled rows for validation; random_state fixes the
# shuffle so the split is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Data pre-processing is done
Training Data
* **x_train**
* **x_val**
* **y_train**
* **y_val**

Submission Data
* **test**

In [21]:
from sklearn.linear_model import LogisticRegression
# mean_squared_error stays imported so any later cell that uses the name still works.
from sklearn.metrics import accuracy_score, mean_squared_error

# Logistic regression with the liblinear solver; class_weight='balanced'
# reweights the classes inversely to their frequency in y_train.
lr = LogisticRegression(solver='liblinear',class_weight='balanced')
lr.fit(x_train,y_train)
y_predict = lr.predict(x_val)

# Score the classifier with accuracy (fraction of validation rows predicted
# correctly). The earlier mean_squared_error call is a regression metric; on
# 0/1 labels it equals the misclassification rate, i.e. 1 - accuracy.
accuracy_score(y_val,y_predict)

# Submission

In [22]:
submission.head()

In [23]:
# Predict Survived for the prepared test frame with the fitted model `lr`.
submission_pred = lr.predict(test)

In [24]:
# Pair the saved test PassengerId values with the predictions and write them
# to submission.csv without the row index.
submission_columns = {'PassengerId': test_Id, 'Survived': submission_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)