In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the libraries

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Importing the datasets

In [3]:
# Read the Titanic train and test CSV files (paths are relative to the notebook's working directory).
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

## Merging the training and test data

In [4]:
# Stack the train and test rows into one frame so the cleaning and encoding
# steps below are applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported drop-in (same row order, original indexes preserved).
df = pd.concat([df_train, df_test])

## Exploratory Data Analysis

In [5]:
df.head()

In [6]:
df.columns

In [7]:
df.info()

### Checking correlation between variables

In [8]:
# Correlate numeric/bool columns only. At this point the frame still contains
# text columns ('Sex', 'Embarked', ... are encoded or dropped further down), and
# pandas >= 2.0 raises on them instead of silently skipping them.
numeric_cols = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr(), annot=True, cmap='viridis')

### Distribution of Age onboard

In [9]:
# Histogram of passenger ages (40 bins, no density curve).
# sns.distplot is deprecated in seaborn; histplot is its documented replacement
# for the kde=False case.
sns.histplot(data=df, x='Age', bins=40, kde=False, color='blue')

### Number of people who boarded from different places

In [11]:
# Count passengers per embarkation code once, then plot each code's share.
embarked_counts = df['Embarked'].value_counts()
place_names, place_values = embarked_counts.index, embarked_counts.values
plt.pie(place_values, labels=place_names, autopct='%1.2f%%')

### Number of Males and females onboard

In [12]:
sns.countplot(x='Sex',data=df)

### Survival counts by sex

In [13]:
# Bars per 'Survived' value, split by the 'Sex' column of the same frame.
sns.countplot(data=df, x='Survived', hue='Sex')

### Survival counts by passenger class

In [14]:
# Bars per 'Survived' value, split by the 'Pclass' column of the same frame.
sns.countplot(data=df, x='Survived', hue='Pclass')

## Feature Engineering

### Checking null values

In [15]:
df.shape

In [16]:
df.isnull().sum()

### Heatmap of Null Values

In [17]:
import matplotlib

# Widen the default figure size for this and all later plots.
matplotlib.rcParams.update({'figure.figsize': (12, 6)})

# One cell per (row, column); highlighted cells mark missing values.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

### Dropping the Cabin Column

In [None]:
# Remove the 'Cabin' column from the working frame.
df = df.drop(columns=['Cabin'])

In [20]:
df.head()

### Dropping the Passenger ID Column

In [21]:
# Remove the 'PassengerId' column from the working frame
# (the submission cell reads the ids from df_test instead).
df = df.drop(columns=['PassengerId'])

In [22]:
df.head()

### Filling missing values in Age with median

In [23]:
# Replace missing ages with the median age of the combined (train + test) frame.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [24]:
df.isnull().sum()

### Dropping Ticket column

In [25]:
# Remove the 'Ticket' column from the working frame.
df = df.drop(columns=['Ticket'])

In [26]:
df.head()

### Dropping Name Column

In [27]:
# Remove the 'Name' column from the working frame.
df = df.drop(columns=['Name'])

In [28]:
df.head()

### Encoding categorical column "Embarked"

In [38]:
df['Embarked'].unique()

In [39]:
df['Embarked'].mode()

In [40]:
def embark(row):
    """Encode a single 'Embarked' value as an integer code.

    'S' -> 0, 'C' -> 1, 'Q' -> 2. Any other value (including a missing
    value) falls back to 0, i.e. it is treated the same as 'S'.

    Despite the parameter name, `row` is one cell value, not a DataFrame row.
    """
    # Position in the tuple is the code; compared in the order S, C, Q.
    for code, port in enumerate(('S', 'C', 'Q')):
        if row == port:
            return code
    return 0
# Encode every 'Embarked' value element-wise. Re-running this cell on the
# already-encoded column would map everything to 0, so run it once per kernel.
df['Embarked'] = df['Embarked'].map(embark)

In [41]:
df.head()

In [42]:
df.isnull().sum()

### Filling missing value in Fare column with median

In [34]:
# Replace missing fares with the median fare of the combined (train + test) frame.
fare_median = df['Fare'].median()
df['Fare'] = df['Fare'].fillna(fare_median)

### Encoding the categorical column Sex

In [35]:
df['Sex'].unique()

In [36]:
def sexchange(row):
    """Encode a single 'Sex' value: 1 for exactly 'male', 0 for anything else.

    Despite the parameter name, `row` is one cell value, not a DataFrame row.
    """
    return 1 if row == 'male' else 0
# Encode every 'Sex' value element-wise. Re-running this cell on the
# already-encoded column would map everything to 0, so run it once per kernel.
df['Sex'] = df['Sex'].map(sexchange)

In [43]:
df.isnull().sum()

In [44]:
df.info()

In [47]:
# Undo the earlier merge: rows with a 'Survived' value go to df_train1,
# rows where it is missing go to df_test1.
survived_missing = df['Survived'].isnull()
df_train1 = df[~survived_missing]
df_test1 = df[survived_missing]

In [48]:
df_test1.shape

In [49]:
df_train1.head()

In [50]:
df_test1.head()

In [51]:
# The 'Survived' column is entirely missing for these rows, so remove it
# and keep only the features.
df_test1 = df_test1.drop(columns='Survived')
df_test1.head()

In [52]:
# Feature matrix: every column of the labelled rows except the target.
X = df_train1.drop(columns='Survived')
X.head()

In [53]:
# Target vector. 'Survived' holds missing values in the merged frame (that is
# how the test rows are identified above), which makes the column float; the
# labelled rows have no missing values, so cast back to int. This keeps the
# classifier's predicted labels as integers (0/1) rather than 0.0/1.0.
y = df_train1['Survived'].astype(int)
y.head()

## Splitting into train and test set

In [55]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [56]:
len(X_train)

In [57]:
len(y_train)

In [59]:
len(X_test)

In [60]:
len(y_test)

## Feature Scaling

In [61]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Training Logistic Regression Classifier

In [62]:
# Baseline: logistic regression with default hyperparameters on the scaled training split.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

In [63]:
# Score the baseline on the held-out split: confusion matrix, then accuracy.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [64]:
# 10-fold cross-validation on the training split; report mean and spread in percent.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print('Accuracy: {:.2f} %'.format(mean_pct))
print('Standard deviation: {:.2f} %'.format(std_pct))

In [65]:
# Search regularisation strength and penalty type with 10-fold CV.
# The solver is pinned to 'liblinear' because it supports both 'l1' and 'l2';
# the default solver in current scikit-learn ('lbfgs') rejects 'l1', which
# would make half of the candidate fits fail.
param_grid = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}
grid = GridSearchCV(classifier, param_grid, refit=True, verbose=3, cv=10)
grid.fit(X_train, y_train)

In [66]:
print(grid.best_params_)
print(grid.best_estimator_)

In [67]:
# Use the model the grid search already refit (refit=True) on the training
# split with its best hyperparameters, instead of hard-coding values copied
# from a previous run's printout, which go stale whenever data or versions change.
classifier = grid.best_estimator_
classifier

In [68]:
# Score the tuned model on the held-out split: confusion matrix, then accuracy.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [70]:
from sklearn.linear_model import LogisticRegression

# Final model: back to default hyperparameters, fit on the scaled training split.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

In [71]:
# Score the final model on the held-out split: confusion matrix, then accuracy.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [73]:
# Scale the unlabelled rows with the scaler fitted on the training split, then predict.
X_submission = sc.transform(df_test1)
pred_out = classifier.predict(X_submission)
pred_out

In [74]:
len(pred_out)

In [76]:
len(df_test.PassengerId)

In [78]:
# Build the submission: one row per test passenger, in the original test-file order.
# The predicted labels are floats (0.0/1.0) when the target was taken from the
# NaN-padded merged 'Survived' column; write them as integer 0/1 labels.
output = pd.DataFrame({
    'PassengerId': df_test.PassengerId,
    'Survived': pred_out.astype(int),
})
output.to_csv('Titanic_Survival_submission.csv', index=False)

# Preview last so the cell actually displays it (a mid-cell expression is discarded).
output.head(20)