In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


**Imports**

In [2]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
from math import sqrt

Load data files

In [3]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.shape

(891, 12)

Descriptive Analytics (Stats + visuals)

In [4]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
df_train[pd.isnull(df_train.Embarked)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [7]:
# Missing Value Treatment for other parameters
# Remove Cabin as more than 50% of the values are null
df_train.Embarked = df_train.Embarked.fillna(df_train.Embarked.mode())

In [8]:
df_train['Title'] = df_train['Name'].map(lambda s: s[s.find(',')+len(','):s.find('.')].strip())
df_train = df_train.replace({'Title': {'Sir': 'Mr', 'Lady': 'Mrs', 'the Countess': 'Mrs', 'Mme': 'Miss', 'Capt': 'Mr', 'Don': 'Mr','Mlle': 'Miss','Ms':'Miss','Rev': 'Mr','Col': 'Mr','Jonkheer': 'Mr','Major': 'Mr'}})
df_train = pd.get_dummies(df_train, columns=['Title','Pclass','Embarked'])
print(df_train.columns)

Index(['PassengerId', 'Survived', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Title_Dr', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [9]:
# Treating Fare
df_train.Fare = np.where(df_train.Fare>200, df_train.Fare.mean(), df_train.Fare)

# Treating Pclass
df_train = df_train.astype({'Pclass': str})

# Treating Sex
df_train.Sex = np.where(df_train.Sex == 'male', 1, 0)

KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

In [10]:
df_age_train = df_train[pd.notnull(df_train.Age)]
df_age_test = df_train[pd.isnull(df_train.Age)]
print(df_age_train.shape, df_age_test.shape)
Age_X = df_age_train[['Fare', 'Title_Dr', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs']]
Age_y = df_age_train[['Age']]
Age_X_test = df_age_test[['Fare', 'Title_Dr', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs']]

(714, 21) (177, 21)


In [11]:
reg = LinearRegression()
reg.fit(Age_X,Age_y)
y_pred = reg.predict(Age_X)

In [12]:
print(sqrt(mean_squared_error(Age_y,y_pred)))

# Impute Age
df_age_test.Age = reg.predict(Age_X_test)

12.061080906358658


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
df_train = pd.concat([df_age_train, df_age_test])
df_train.shape

(891, 21)

In [14]:
df_train.isnull().sum()

PassengerId       0
Survived          0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Title_Dr          0
Title_Master      0
Title_Miss        0
Title_Mr          0
Title_Mrs         0
Pclass_1          0
Pclass_2          0
Pclass_3          0
Embarked_C        0
Embarked_Q        0
Embarked_S        0
dtype: int64

Data Preparation for Modeling

In [15]:
X = df_train[list(set(df_train.columns) - set(['Cabin','Survived','PassengerId','Ticket','Name']))]
y = df_train[['Survived']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

Model Development & Selection

In [16]:
logit = LogisticRegression()
logit.fit(X_train, y_train)
y_pred = logit.predict(X_test)



ValueError: could not convert string to float: 'male'

In [17]:
# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred)))
print('Recall Score : ' + str(recall_score(y_test,y_pred)))
print('F1 Score : ' + str(f1_score(y_test,y_pred)))

#Logistic Regression (Grid Search) Confusion matrix
metrics.confusion_matrix(y_test,y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [268, 714]

In [18]:
df_test_raw = pd.read_csv('/kaggle/input/titanic/test.csv')
df_test_raw.shape

(418, 11)

In [19]:
df_test = df_test_raw.copy()

In [20]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [21]:
df_test['Title'] = df_test['Name'].map(lambda s: s[s.find(',')+len(','):s.find('.')].strip())
df_test = df_test.replace({'Title': {'Sir': 'Mr', 'Lady': 'Mrs', 'the Countess': 'Mrs', 'Mme': 'Miss', 'Capt': 'Mr', 'Don': 'Mr','Mlle': 'Miss','Ms':'Miss','Rev': 'Mr','Col': 'Mr','Jonkheer': 'Mr','Major': 'Mr','Dona':'Mrs'}})
df_test.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Dr'], dtype=object)

In [22]:
# Missing Value Treatment
df_test.Fare = df_test.Fare.fillna(df_test.Fare.mean())
df_train.Fare = np.where(df_train.Fare>200, df_train.Fare.mean(), df_train.Fare)
df_test.Embarked = df_test.Embarked.fillna(df_test.Embarked.mode())

# Preparation for model application
df_test = df_test.astype({'Pclass': str})
df_test.Sex = np.where(df_test.Sex == 'male', 1, 0)
df_test = pd.get_dummies(df_test, columns=['Pclass','Embarked','Title'])

In [23]:
df_age_train = df_test[pd.notnull(df_test.Age)]
df_age_test = df_test[pd.isnull(df_test.Age)]
print(df_age_train.shape, df_age_test.shape)
Age_X = df_age_train[['Fare', 'Title_Dr', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs']]
Age_y = df_age_train[['Age']]
Age_X_test = df_age_test[['Fare', 'Title_Dr', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs']]

(332, 20) (86, 20)


In [24]:
# Impute Age
df_age_test.Age = reg.predict(Age_X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [25]:
df_test = pd.concat([df_age_train, df_age_test])
df_test.shape

(418, 20)

In [26]:
df_test = df_test[list(set(df_test.columns).intersection(set(X_train.columns)))]

In [27]:
X_train.columns, df_test.columns

(Index(['Title_Mr', 'Pclass_3', 'Title_Miss', 'Age', 'Sex', 'Pclass_2', 'Parch',
        'Title_Dr', 'Embarked_C', 'Embarked_S', 'Fare', 'Title_Mrs',
        'Embarked_Q', 'Pclass_1', 'Title_Master', 'SibSp'],
       dtype='object'),
 Index(['Title_Mr', 'Pclass_3', 'Title_Miss', 'Age', 'Sex', 'Pclass_2',
        'Title_Dr', 'Parch', 'Embarked_C', 'Embarked_S', 'Fare', 'Title_Mrs',
        'Embarked_Q', 'Pclass_1', 'Title_Master', 'SibSp'],
       dtype='object'))

In [28]:
df_test_raw['Survived'] = logit.predict(df_test)

NotFittedError: This LogisticRegression instance is not fitted yet

In [29]:
df_test_raw[['PassengerId','Survived']].to_csv('gender_submission.csv')

KeyError: "['Survived'] not in index"