# Component 2: Classification

## Import statements:

In [1]:
# Imports the required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Reads the titanic training and test splits from CSV files in the working directory
titanicTraining, titanicTest = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

## Dropping Features That Are Not Important:

In [3]:
# Sets the test-set passenger identifiers aside before the column is dropped;
# every Kaggle submission file written later is keyed on them
passengerId = titanicTest.loc[:, 'PassengerId']

In [4]:
# Drops the columns that are not used as model features.
# The frames are reassigned instead of being mutated with inplace=True, and
# labels that are already gone are ignored, so re-running this cell does not
# raise a KeyError.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanicTraining = titanicTraining.drop(columns=unused_columns, errors='ignore')
titanicTest = titanicTest.drop(columns=unused_columns, errors='ignore')

## Feature Engineering:

In [5]:
# Derives the family-based features on both splits with the same steps
for frame in (titanicTraining, titanicTest):
    # Family size: number of siblings/spouses + number of parents/children + the passenger themselves
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # 1 when the passenger travels alone, 0 otherwise; perhaps more likely to
    # survive if there is no family to take care of?
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

    # 'SibSp' now seems redundant, so it is dropped
    frame.drop(columns=['SibSp'], inplace=True)

## Imputing Missing Values:

In [6]:
# Creates an imputer that replaces NaN values with the median of the column values
numeric_imputer = SimpleImputer(strategy='median')

# Creates an imputer that replaces NaN values with the most frequent of the column values
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Columns handled by each imputer
numeric_columns = ['Age', 'Fare']
categorical_columns = ['Embarked']

# The fill values are learned from the training split only and then reused on
# the test split, so no test-set statistics leak into the preprocessing and
# both splits are filled with the same values.
titanicTraining[numeric_columns] = numeric_imputer.fit_transform(titanicTraining[numeric_columns])
titanicTest[numeric_columns] = numeric_imputer.transform(titanicTest[numeric_columns])

titanicTraining[categorical_columns] = categorical_imputer.fit_transform(titanicTraining[categorical_columns])
titanicTest[categorical_columns] = categorical_imputer.transform(titanicTest[categorical_columns])

## Encoding Categorical Values:

In [7]:
# Stores the name of the columns with type 'object' in a list
object_columns = ['Sex', 'Embarked']

# Readable names for the five one-hot columns, in the order the encoder emits
# them (two for 'Sex', then three for 'Embarked').
# NOTE(review): assumes the training categories sort as female/male and
# Cherbourg/Queenstown/Southampton codes; the DataFrame constructor below
# raises if the encoder produces a different number of columns.
encoded_columns = ['Female', 'Male', 'Cherbourg', 'Queenstown', 'Southampton']

# Creates a one hot encoder that ignores classes not represented in the training data.
# The default sparse output is densified with .toarray() below, which avoids the
# `sparse` / `sparse_output` keyword that differs between scikit-learn releases.
encoder = OneHotEncoder(handle_unknown='ignore')

# Learns the categories from the training split only, then applies that same
# encoding to the test split so both frames share an identical column layout.
# The original row index is restored because the encoder returns a bare array.
encoded_data_train = pd.DataFrame(
    encoder.fit_transform(titanicTraining[object_columns]).toarray(),
    columns=encoded_columns,
    index=titanicTraining.index,
)
encoded_data_test = pd.DataFrame(
    encoder.transform(titanicTest[object_columns]).toarray(),
    columns=encoded_columns,
    index=titanicTest.index,
)

# Removes the categorical columns
titanicTrainingNumerical = titanicTraining.drop(object_columns, axis=1)
titanicTestNumerical = titanicTest.drop(object_columns, axis=1)

# Concatenates the numerical columns with the encoded categorical columns and
# fixes the column order (the target column stays last in the training frame)
feature_order = ['Age', 'Parch', 'FamilySize', 'IsAlone', 'Fare', 'Pclass'] + encoded_columns
titanicTrainingEncoded = pd.concat([titanicTrainingNumerical, encoded_data_train], axis=1)[feature_order + ['Survived']]
titanicTestEncoded = pd.concat([titanicTestNumerical, encoded_data_test], axis=1)[feature_order]

## Separating Target Column From Data:

In [8]:
# Separates the target column from the rest of the data.
# The labels are cast directly to integers rather than looked up in a
# {True: 1, False: 0} dict, so the result does not depend on integer labels
# being matched against boolean keys; this works for both 0/1 and boolean input.
trainingY = titanicTrainingEncoded['Survived'].astype(int)
trainingX = titanicTrainingEncoded.drop('Survived', axis=1)
testX = titanicTestEncoded

## K-Nearest Neighbors Classification:

In [9]:
# Creates a new default KNN classifier
knn = KNeighborsClassifier()

# Creates a min-max scaler. The feature ranges are learned from the training
# split only; the test split is transformed with those same ranges so that
# both splits live on one scale (test values may fall slightly outside [0, 1]).
scaler = MinMaxScaler()
scaled_trainingX = pd.DataFrame(scaler.fit_transform(trainingX), columns=trainingX.columns, index=trainingX.index)
scaled_testX = pd.DataFrame(scaler.transform(testX), columns=testX.columns, index=testX.index)

# Performs 5-fold cross validation on the dataset and records the scores
cross_val_knn = cross_val_score(knn, scaled_trainingX, trainingY, cv=5)

# Shows the average of the scores
np.mean(cross_val_knn)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


0.7947060185751619

## Optimising Hyperparameters:

In [10]:
# Creates a new default logistic regression classifier
knn2 = KNeighborsClassifier()

# Creates the potential parameters for the classifier
neighbors = [3,5,11,19]
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan'] 

knn_parameters = {'n_neighbors': neighbors , 'weights': weights, 'metric': metric}

# Runs a search over the parameter values for the classifier
grid_knn = GridSearchCV(knn2, knn_parameters, cv=5, n_jobs=-1)

%time grid_knn.fit(scaled_trainingX, trainingY)

Wall time: 6.57 s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# Shows the best cross-validated score together with the parameters that achieved it
(grid_knn.best_score_, grid_knn.best_params_)

(0.8024691358024691,
 {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'})

## Logistic Regression Classification:

In [12]:
# Creates a logistic regression classifier with the default regularisation.
# The solver is named explicitly: the recorded run left it unspecified
# (solver='warn' in the printed estimator), which makes the result depend on
# whichever default the installed scikit-learn release happens to use.
lgr = LogisticRegression(solver='liblinear', random_state=42)

# Performs 5-fold cross validation on the dataset and records the scores
cross_val_lgr = cross_val_score(lgr, trainingX, trainingY, cv=5)

# Shows the average of the scores
np.mean(cross_val_lgr)



0.7935070982311785

## Optimising Hyperparameters:

In [13]:
# Creates a new default logistic regression classifier
lgr2 = LogisticRegression(random_state=42)

# Creates the potential parameters for the classifier
penalties = ['l1', 'l2']
Cs = np.logspace(0, 4, 10)

regression_parameters = {'C': Cs,'penalty': penalties}

# Runs a search over the parameter values for the classifier
grid_lgr = GridSearchCV(lgr2, regression_parameters, cv=5, n_jobs=-1)

%time grid_lgr.fit(trainingX, trainingY)

Wall time: 397 ms




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# Shows the best cross-validated score together with the parameters that achieved it
(grid_lgr.best_score_, grid_lgr.best_params_)

(0.7934904601571269, {'C': 1.0, 'penalty': 'l2'})

## Random Forest Classification:

In [15]:
# Creates a random forest classifier that uses 100 trees (suggested default).
# The seed is fixed, as for the other seeded models in this notebook, so the
# cross-validation score below is reproducible between runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Performs 5-fold cross validation on the dataset and records the scores
cross_val_rfc = cross_val_score(rfc, trainingX, trainingY, cv=5)

# Shows the average of the scores (may get better result after optimising hyperparameters)
np.mean(cross_val_rfc)

0.8025084873431929

## Optimising Hyperparameters:

In [16]:
# Creates a new default random forest classifier
rfc2 = RandomForestClassifier(n_estimators=64, random_state=42)

# Creates the potential parameters for the classifier
depths = np.arange(1,11)
features = np.arange(1, trainingX.shape[1] + 1)

tree_parameters = {'max_depth': depths,'max_features': features}

# Runs a search over the parameter values for the classifier
grid_rfc = GridSearchCV(rfc2, tree_parameters, cv=5, n_jobs=-1)

%time grid_rfc.fit(trainingX, trainingY)

Wall time: 3.94 s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
# Shows the best cross-validated score together with the parameters that achieved it
(grid_rfc.best_score_, grid_rfc.best_params_)

(0.8428731762065096, {'max_depth': 8, 'max_features': 9})

## Voting Classifier (Ensemble Method):

In [18]:
# Rebuilds each classifier with the hyperparameters selected by its grid search.

# KNN was tuned on min-max scaled features, so it is wrapped in a pipeline that
# applies the same kind of scaling. The ensemble can then be fitted on the
# unscaled frame that the other two models were tuned on.
knn_best = make_pipeline(
    MinMaxScaler(),
    KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='manhattan'),
)

# Same solver and seed as the logistic regression that was tuned
lgr_best = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', random_state=42)

# max_depth=8 and max_features=9 are the values reported by the random forest
# grid search; keep them in sync with grid_rfc.best_params_ if it is re-run
rfc_best = RandomForestClassifier(n_estimators=64, max_depth=8, max_features=9, random_state=42)

models = [('knn', knn_best), ('lgr', lgr_best), ('rfc', rfc_best)]

# Hard voting: the ensemble predicts the class label chosen by most of the models
ensemble = VotingClassifier(models, voting='hard')

## Kaggle Submission:

### K-Nearest Neighbors:

In [19]:
# Trains the tuned KNN on the full scaled training set and predicts the scaled test set
knn_final = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='uniform')
predictions = knn_final.fit(scaled_trainingX, trainingY).predict(scaled_testX)

# Writes the passenger ids next to their predicted labels as the submission file
output = pd.DataFrame({'PassengerId': passengerId, 'Survived': predictions})
output.to_csv('knn.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


### Logistic Regression:

In [20]:
# Trains the tuned logistic regression on the full training set and predicts the test set.
# The solver and seed are stated explicitly so this final model is configured
# like the seeded classifier that the grid search evaluated, instead of
# depending on the installed library's default solver.
lgr_final = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', random_state=42)
lgr_final.fit(trainingX, trainingY)
predictions = lgr_final.predict(testX)

# Writes the passenger ids next to their predicted labels as the submission file
output = pd.DataFrame({'PassengerId': passengerId, 'Survived': predictions})
output.to_csv('logisticRegression.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!




### Random Forest:

In [21]:
# Trains the tuned random forest on the full training set and predicts the test set.
# max_depth=8 and max_features=9 are the values reported by the random forest
# grid search; keep them in sync with grid_rfc.best_params_ if it is re-run.
rfc_final = RandomForestClassifier(n_estimators=64, max_depth=8, max_features=9, random_state=42)
rfc_final.fit(trainingX, trainingY)
predictions = rfc_final.predict(testX)

# Writes the passenger ids next to their predicted labels as the submission file
output = pd.DataFrame({'PassengerId': passengerId, 'Survived': predictions})
output.to_csv('randomForest.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


### Voting Classifier:

In [22]:
# Trains the voting ensemble on the full training set and predicts the test set
predictions = ensemble.fit(trainingX, trainingY).predict(testX)

# Writes the passenger ids next to their predicted labels as the submission file
output = pd.DataFrame({'PassengerId': passengerId, 'Survived': predictions})
output.to_csv('ensembleHard.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


