Import libraries

In [1]:
#make sure we install scikit-learn before we import 
!pip install -U scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/30/aa/db462d385c56905b731403885454188683f63c86ea68900f6f7e7558b5fa/scikit_learn-0.24.0-cp36-cp36m-manylinux2010_x86_64.whl (22.2MB)
[K     |████████████████████████████████| 22.2MB 1.4MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.1.0


In [None]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
# `sklearn.datasets.samples_generator` no longer exists in scikit-learn 0.24
# (the version installed above); `make_blobs` is importable from the public
# `sklearn.datasets` namespace instead.
from sklearn.datasets import make_blobs

from sklearn.metrics.cluster import contingency_matrix
from scipy.special import comb
import matplotlib.pyplot as plt

import pylab as pl
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

Clustering metrics function

In [None]:
def clustering_metrics(y_true, y_pred):
    """Compute pair-counting Rand index, pair-counting F1 and purity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels produced by the algorithm.

    Returns
    -------
    tuple
        ``(rand_index, F1, purity)``.
    """
    # Contingency table: one row per ground-truth class, one column per
    # predicted cluster.
    table = contingency_matrix(y_true, y_pred)
    n_samples = np.sum(table)

    # "N choose 2" counts pairs of samples (scipy.special.comb).
    # Pairs sharing a predicted cluster: TP + FP.
    same_cluster_pairs = np.sum(comb(np.sum(table, axis=0), 2))
    # Pairs sharing a ground-truth class: TP + FN.
    same_class_pairs = np.sum(comb(np.sum(table, axis=1), 2))
    # Pairs sharing both: TP.
    true_pos = np.sum(comb(table, 2))

    false_pos = same_cluster_pairs - true_pos
    false_neg = same_class_pairs - true_pos
    # Whatever remains of all possible pairs is TN.
    true_neg = comb(n_samples, 2) - true_pos - false_pos - false_neg

    rand_index = (true_pos + true_neg) / (true_pos + false_pos + false_neg + true_neg)
    F1 = 2 * true_pos / (2 * true_pos + false_pos + false_neg)

    # Purity: for each predicted cluster take its largest class count, then
    # divide the total by the number of samples.
    purity = np.sum(table.max(axis=0)) / n_samples

    return rand_index, F1, purity

Import the training and testing data sets

In [None]:
# Load the training and testing tables from the comma-separated files
# that sit next to the notebook.
df_train, df_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('train_data.csv', 'test_data.csv')
)

Data pre-processing

1. Balance the number of samples in each class

In [None]:
# Separate the training rows by label so the two classes can be balanced.

# Rows labelled Class == 0
df_train0 = df_train.loc[df_train['Class'].eq(0)]
# Rows labelled Class == 1
df_train1 = df_train.loc[df_train['Class'].eq(1)]

In [None]:
# Down-sample class 0 (without replacement) to the same number of rows as class 1.
# A fixed random_state makes the draw repeatable, so the balanced training set and
# every result computed from it are the same on each Restart & Run All.
df_train0 = df_train0.sample(n=len(df_train1), replace=False, random_state=100)

In [None]:
# Stack the down-sampled class-0 rows on top of the class-1 rows
# to form the balanced training frame.
balanced_parts = [df_train0, df_train1]
df_train = pd.concat(balanced_parts, axis=0)

2. Factorize the "Sector" column in both the training and testing data sets

In [None]:
# Encode each sector label as an integer code, stored in a new Sector_fac column.
# pd.factorize returns (codes, uniques); both halves are needed here.
sector_codes, sector_index = pd.factorize(df_train['Sector'])
df_train['Sector_fac'] = sector_codes
# Lookup from sector label to its integer code, used to encode other frames.
sector_dict = dict(zip(sector_index, range(len(sector_index))))

In [None]:
# Encode the test set's Sector column with the codes learned from the training set
# (sector_dict, created above).
# The mapping is applied to the Sector column only: a frame-wide replace would also
# rewrite any other cell that happens to equal a sector label.
# Labels missing from sector_dict are left unchanged.
df_test['Sector'] = df_test['Sector'].replace(sector_dict)

In [None]:
# Work on the underlying NumPy arrays from here on.
train_data, test_data = df_train.values, df_test.values

In [None]:
# The labels are taken from the second-to-last column of the training array.
train_labels = [label for label in train_data[:, -2]]

Minor adjustment for each data set

In [None]:
# Training features: drop, in a single call, the company-name column (first),
# the original sector column (third from last) and the class column
# (second from last). The last column is kept.
train_features = np.delete(train_data, [0, -3, -2], axis=1)

# Testing features: only the company-name column (first) is dropped.
test_features = np.delete(test_data, 0, axis=1)

Imputation

In [None]:
# preprocessing
# Fill missing values with a distance-weighted average of the 5 nearest neighbours.
imp = KNNImputer(n_neighbors=5, weights='distance')
# The imputer is fitted on the training features only; the fitted imputer is then
# reused to fill the test features.
imputed_train_features = imp.fit_transform(train_features)
imputed_test_features = imp.transform(test_features)

Feature engineering

In [None]:
# Low-variance feature filter; the threshold is 0.6 * (1 - 0.6) = 0.24.
sel = VarianceThreshold(threshold=0.6 * (1 - 0.6))
sel.fit(imputed_train_features)
reduced_features = sel.transform(imputed_train_features)

In [None]:
# Shape after the variance filter (198 columns in the recorded output below)
reduced_features.shape

(1922, 198)

In [None]:
# Shape before the variance filter, for comparison: originally there were 222 columns
imputed_train_features.shape

(1922, 222)

Our main classifier (with all columns included)

In [None]:
# Sanity check: the test features should have the same number of columns as the training features
imputed_test_features.shape

(1488, 222)

In [None]:
# Training feature shape, shown for comparison with the test feature shape
imputed_train_features.shape

(1922, 222)

In [None]:
# Hold out 20% of the (full-width) training features for validation; seeded split.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_train_features,
    train_labels,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Same 80/20 seeded split, applied to the variance-filtered features.
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
    reduced_features,
    train_labels,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Main classifier: 5-nearest-neighbours on all columns.
# fit() returns the estimator itself, so construction and training are chained.
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Predict the held-out validation rows.
predictions = clf.predict(X_test)

In [None]:
# evaluation metrics
# `predictions` come from the KNN classifier `clf`, so the report is labelled "KNN"
# and K is read from the estimator instead of being typed in by hand.
rand_index, F1, purity = clustering_metrics(y_test, predictions)
print('KNN: K =', clf.n_neighbors)
print('Rand index =', rand_index)
print('F1 =', F1)
print('Purity =', purity)

Kmeans: K = 5
Rand index = 0.5096861471861471
F1 = 0.5092612651646448
Purity = 0.574025974025974


Experimental classifier (with reduced columns included)

In [None]:
# Experimental classifier: 5-nearest-neighbours on the variance-filtered columns.
# fit() returns the estimator itself, so construction and training are chained.
clf_temp = KNeighborsClassifier(n_neighbors=5).fit(X_train_temp, y_train_temp)
# Predict the held-out validation rows.
predictions_temp = clf_temp.predict(X_test_temp)

In [None]:
# evaluation metrics
# `predictions_temp` come from the KNN classifier `clf_temp`, so the report is
# labelled "KNN" and K is read from the estimator instead of being typed in by hand.
rand_index_temp, F1_temp, purity_temp = clustering_metrics(y_test_temp, predictions_temp)
print('KNN: K =', clf_temp.n_neighbors)
print('Rand index =', rand_index_temp)
print('F1 =', F1_temp)
print('Purity =', purity_temp)

Kmeans: K = 5
Rand index = 0.5096861471861471
F1 = 0.5092612651646448
Purity = 0.574025974025974


Final model

In [None]:
# Hyper-parameter grid: every k from 1 to 30, with both neighbour-weighting schemes.
k_range = range(1, 31)
weights_options = ['uniform', 'distance']
param = dict(n_neighbors=k_range, weights=weights_options)

# Cross-validation scheme: StratifiedShuffleSplit with 10 seeded splits,
# each holding out 30% of the rows.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.30, random_state=15)

# Exhaustive search over the grid, using all available cores.
grid = GridSearchCV(
    KNeighborsClassifier(),
    param,
    cv=cv,
    verbose=False,
    n_jobs=-1,
)

# Fit the search on the full imputed training set.
grid.fit(imputed_train_features, train_labels)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': range(1, 31),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=False)

In [None]:
# Show the hyper-parameter combination selected by the grid search
grid.best_params_

{'n_neighbors': 26, 'weights': 'distance'}

In [None]:
# Final classifier
# Build the KNN model from the grid search's winning hyper-parameters instead of
# re-typing them, so this cell stays in sync with `grid` if the search result changes.
clf = KNeighborsClassifier(**grid.best_params_)  # KNN classifier
clf.fit(imputed_train_features, train_labels)

# Accuracy (in percent) on the same rows the model was just fitted on.
# This is a training-set score, not a held-out estimate; `grid.best_score_`
# holds the cross-validated score of the selected parameters.
acc_knn = round(clf.score(imputed_train_features, train_labels) * 100, 2)
print(acc_knn)

# prediction on the imputed test features
predictions = clf.predict(imputed_test_features)

98.13


In [None]:
# Write the submission file: start from the sample submission, overwrite its
# Class column with the model's predictions, and save without the index.
df_sub = pd.read_csv('sampleSubmission.csv', sep=',').assign(Class=predictions)
df_sub.to_csv('final_submission2.csv', index=False, header=True)
print('done!')

done!
