***EXERCISE 1 - BUILD A CLASSIFIER FOR THE MNIST DATASET THAT ACHIEVES OVER 97% ACCURACY ON THE TEST SET***

In [1]:
from scipy.io import arff
import pandas as pd
import os

data = arff.loadarff('%s/Downloads/mnist_784.arff'%os.environ['HOME'])
df = pd.DataFrame(data[0])
df.shape

(70000, 785)

In [2]:
X, y_raw = df.copy(), df['class']
X.drop('class',axis=1,inplace=True)

# Convert dtype of y to strings
y = y_raw.to_numpy('str')
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [5]:
# Split into training and test sets - NOTE: MNIST is already shuffled
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

Build a classifier that achieves > 97% accuracy

In [6]:
# Consider a KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Grid search on weights and n_neighbors hyperparameters
param_grid = [
    {'n_neighbors': [3,4,5,6], 'weights': ['uniform', 'distance'] }
]

# Init the classifier
knn_clf = KNeighborsClassifier()

# Create the grid_search
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=True)

In [7]:
# Run the grid search - this takes awhile
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 

In [8]:
# If opening notebook from a previous session wherein grid_search has been run,
# simply set grid_search's best_params_. Otherwise, fetch best_params_ from instance above

grid_search.best_params_ = {'n_neighbors': 4, 'weights': 'distance'}
#grid_search.best_params_

In [13]:
cv_results = grid_search.cv_results_

In [16]:
import numpy as np

for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(np.sqrt(mean_score), params)

0.9849196244702746 {'n_neighbors': 3, 'weights': 'uniform'}
0.9854525187276485 {'n_neighbors': 3, 'weights': 'distance'}
0.9841239759298622 {'n_neighbors': 4, 'weights': 'uniform'}
0.9857061766402129 {'n_neighbors': 4, 'weights': 'distance'}
0.9845218805762184 {'n_neighbors': 5, 'weights': 'uniform'}
0.9850972879196586 {'n_neighbors': 5, 'weights': 'distance'}
0.9839207285142436 {'n_neighbors': 6, 'weights': 'uniform'}
0.9855117114135851 {'n_neighbors': 6, 'weights': 'distance'}


In [9]:
# Best hyperparameters are:
#.  n_neighbors = 4
#.  weights = 'distance'
from sklearn.model_selection import cross_val_predict

# Now create a classifier with these parameters and fit to training data
knn_clf_best = KNeighborsClassifier(n_neighbors=4,weights='distance')

# Instead of fitting the test set, let's fit the training set via cross-validation
# This will give a sense of performance of classifier before applying it to test set
y_train_knn_clf_best = cross_val_predict(knn_clf_best, X_train, y_train, cv=5)

In [10]:
# Assess accuracy of knn_clf_best
from sklearn.metrics import accuracy_score

# Accuracy exceeds 97%! 
accuracy_score(y_train, y_train_knn_clf_best)

0.9716166666666667

In [11]:
# Now fit the knn_clf_best classifier
knn_clf_best.fit(X_train,y_train)

In [12]:
# Predict on the test set
pred = knn_clf_best.predict(X_test)

# Report accuracy without snooping the test set
print("Accuracy of knn_clf_best classifier on test set = %.2f %%"%
     (accuracy_score(y_test,pred)*100))

Accuracy of knn_clf_best classifier on test set = 97.14 %


***EXERCISE 2 - DATA AUGMENTATION: EXPAND MNIST DATASET BY SHIFTING IMAGES ONE PIXEL LEFT/RIGHT/UP/DOWN***

In [87]:
# Define a function to shift an image by one pixel up, down, left, or right
from scipy.ndimage.interpolation import shift
import matplotlib.pyplot as plt
import pandas as pd

def shiftDigit(digit,by=''):
    # Reshape the digit represented as a pandas series
    d = digit.values.reshape(28,28)
    
    # How to shift the digit
    if by == 'r':
        augDigit = shift(d, [0, 1], cval=0)
    elif by == 'l':
        augDigit = shift(d, [0, -1], cval=0)
    elif by == 'u':
        augDigit = shift(d, [-1, 0], cval=0)
    elif by == 'd':
        augDigit = shift(d, [1, 0], cval=0)
    else:
        raise ValueError("Invalid shift by control : %s"%by)
    
    return pd.Series(augDigit.reshape(28*28))

def showDigit(digit):
    plt.imshow(digit.values.reshape(28,28),cmap='binary')

  from scipy.ndimage.interpolation import shift


In [97]:
# Expand the labels; since shifted images correspond to same labels, simply duplicate the labels
import numpy as np
y_train_expand = np.concatenate((y_train,y_train,y_train,y_train,y_train))
y_train_expand.shape

(300000,)

In [88]:
# Shift each digit in training set left/right/up/down
X_train_l = X_train.copy()
X_train_r = X_train.copy()
X_train_u = X_train.copy()
X_train_d = X_train.copy()

In [94]:
# Apply shifting function
for i in range(X_train.shape[0]):
    X_train_l.iloc[i] = shiftDigit(X_train.iloc[i],by='l')
    X_train_r.iloc[i] = shiftDigit(X_train.iloc[i],by='r')
    X_train_u.iloc[i] = shiftDigit(X_train.iloc[i],by='u')
    X_train_d.iloc[i] = shiftDigit(X_train.iloc[i],by='d')

X_train_expand = pd.concat([X_train, X_train_l, X_train_r, X_train_u, X_train_d],axis=0)

In [99]:
# Fit the knn_clf_best classifier from exercise 1 to the expanded set
X_train_expand.shape, y_train_expand.shape

knn_clf_best.fit(X_train_expand, y_train_expand)

In [102]:
# Apply trained knn_clf_best on original test set
pred_expand = knn_clf_best.predict(X_test)
print("Accuracy of knn_clf_best classifier on test set = %.2f%%"%(accuracy_score(y_test,pred_expand)*100))

Accuracy of knn_clf_best classifier on test set = 97.63%


In [103]:
#####
# Augmenting the data only led to ~0.5% increase in accuracy when applied to the test set
#####

***EXERCISE 3 - THE TITANIC DATASET***

In [35]:
import pandas as pd

titanic_test = pd.read_csv('%s/kaggle/datasets/titanic/test.csv'%os.environ['HOME'])
titanic_train_raw = pd.read_csv('%s/kaggle/datasets/titanic/train.csv'%os.environ['HOME'])

In [36]:
# Shapes of train/test data
titanic_train_raw.shape, titanic_test.shape

((891, 12), (418, 11))

In [37]:
titanic_train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
# Extract the 'Survived' column from titanic_train_raw to use as labels
titanic_y_train = titanic_train_raw.Survived
titanic_train = titanic_train_raw.drop(['Survived'], axis=1)

# The Name of a passenger is conceivably irrelevant to their survival - drop it too
titanic_train = titanic_train.drop('Name', axis=1)

# Set the PassengerId as index
titanic_train.set_index('PassengerId',inplace=True)

In [46]:
titanic_train.shape, titanic_y_train.shape, titanic_test.shape

((891, 9), (891,), (418, 11))

In [47]:
titanic_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,male,22.0,1,0,A/5 21171,7.25,,S
2,1,female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,female,35.0,1,0,113803,53.1,C123,S
5,3,male,35.0,0,0,373450,8.05,,S


In [22]:
# Use titanic_train & titanic_y_train to build/train a binary classifier
# ... i.e., did a passenger survive or not

In [48]:
# Find columns with NaNs
for c in titanic_train.columns:
    if titanic_train[c].isnull().values.any():
        cntNull=titanic_train[c].isnull().sum()
        print("%s has %i NaNs, representing %.2f%% of total"%
              (c,cntNull,(100*cntNull/len(titanic_train[c]))))

Age has 177 NaNs, representing 19.87% of total
Cabin has 687 NaNs, representing 77.10% of total
Embarked has 2 NaNs, representing 0.22% of total


In [49]:
# With a substantial portion of 'Cabin' feature being NaN, consider dropping the column all together
# 'Age' and 'Embarked' features are NaN < 20% and < 1%, respectively. So consider
# using an imputer to replace missing 'Age' values with mean or median of said column,
# and drop DataFrame indices where Embarked is NaN
titanic_train = titanic_train.drop('Cabin',axis=1)
titanic_train.dropna(subset=['Embarked'],inplace=True)

from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy='mean')
med_imputer = SimpleImputer(strategy='median')

In [50]:
titanic_train['Embarked'].isnull().values.any(), titanic_train.shape

(False, (889, 8))

In [53]:
# Having dropped two Passengers missing an 'Embarked' attribute and dropping the 'Cabin' feature,
# proceed to apply Imputer for numerical attributes of a continous range (Age & Fare)
titanic_train_numeric = titanic_train.drop(['Pclass','Sex','SibSp','Parch','Ticket','Embarked'], axis=1)

# Train the mean/median imputers
for imp in [mean_imputer, med_imputer]:
    imp.fit(titanic_train_numeric)
    print(imp.statistics_)

[29.6420927  32.09668088]
[28.     14.4542]


In [58]:
# Transform missing values in 'Age' & 'Fare' features using mean imputer
M = mean_imputer.transform(titanic_train_numeric)

# Convert result to a DataFrame and map columns to titanic_train
titanic_train_numeric_filled = pd.DataFrame(M, columns=titanic_train_numeric.columns,
                                           index=titanic_train_numeric.index)

titanic_train.Age = titanic_train_numeric_filled.Age
titanic_train.Fare = titanic_train_numeric_filled.Fare

titanic_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,A/5 21171,7.25,S
2,1,female,38.0,1,0,PC 17599,71.2833,C
3,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
4,1,female,35.0,1,0,113803,53.1,S
5,3,male,35.0,0,0,373450,8.05,S


In [65]:
# Isolate categorical features
titanic_train_cat = titanic_train.drop(['Pclass','Age','SibSp','Parch','Fare'], axis=1)

In [68]:
titanic_train_cat.Sex.describe()

count      889
unique       2
top       male
freq       577
Name: Sex, dtype: object

In [69]:
titanic_train_cat.Ticket.describe()

count        889
unique       680
top       347082
freq           7
Name: Ticket, dtype: object

In [70]:
titanic_train_cat.Embarked.describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [71]:
# Consider OneHotEncoders for Sex and Embarked attributes, as each take on discrete values and aren't
# inherently ordered.
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()