In [1]:
import pandas as pd
import os
import numpy as np
from pandas import DataFrame

train_df = pd.read_csv(os.path.join(os.getcwd(), 'train.csv'))
test_df = pd.read_csv(os.path.join(os.getcwd(), 'test.csv'))

def convert_to_features_and_target(df: DataFrame, is_training_data=True):
  # Create a dataframe for the set of descriptive features
  features_df = df.iloc[:, 1:-1] if is_training_data else df.iloc[:, 1:]
  features_set = features_df.values
  
  # Convert the features to a two-dimensional array and the target to a one-dimensional array
  target_df = df.iloc[:, -1] if is_training_data else None
  target_set = target_df.values if target_df is not None else None

  return [features_set, target_set]

[training_features_set, training_target_set] = convert_to_features_and_target(train_df, is_training_data=True)
[testing_features_set, _] = convert_to_features_and_target(test_df, is_training_data=False)

### 3.1 Producing bottom-line models

In [None]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(random_state=0)
random_forest_clf.fit(training_features_set, training_target_set)

In [None]:
# k-Nearest Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(training_features_set, training_target_set)

In [None]:
# SVM Classifier

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

svm_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm_clf.fit(training_features_set, training_target_set)

### 3.2 Tuning the bottom-line models

In [None]:
# Tuning the Random Forest Classifier

"""
RandomizedSearchCV yielded a higher prediction score than GridSearchCV for the Random Forest Classifier.
"""

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

tuning_params = {
  'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)], # 10 evenly-spaced integers between 100 and 1000
  'max_features': ['sqrt', 'log2'],
  'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None], # 11 evenly-spaced integers between 100 and 110
  'min_samples_split': [2, 5, 10],
  'min_samples_leaf': [1, 2, 4],
  'bootstrap': [True, False],
  'criterion': ['gini', 'entropy']
}

tuned_random_forest_clf = RandomizedSearchCV(RandomForestClassifier(), tuning_params, cv=5)
tuned_random_forest_clf.fit(training_features_set, training_target_set)
print("Best params:", tuned_random_forest_clf.best_params_)
print("Best score:", tuned_random_forest_clf.best_score_)

Best params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 60, 'criterion': 'entropy', 'bootstrap': False}
Best score: 0.9282686270827185


In [None]:
# Tuning the k-Nearest Neighbors Classifier

"""
GridSearchCV yielded a higher prediction score than RandomizedSearchCV for the k-Nearest Neighbors Classifier.
"""

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

tuning_params = {
  'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
  'weights': ['uniform', 'distance'],
  'algorithm': ['ball_tree', 'kd_tree', 'brute'],
  'leaf_size': [10, 20],
  'p': [1, 2]
}
tuned_knn_clf = GridSearchCV(KNeighborsClassifier(), tuning_params, cv=5)
tuned_knn_clf.fit(training_features_set, training_target_set)
print("Best score:", tuned_knn_clf.best_score_)
print("Best params:", tuned_knn_clf.best_params_)

Best score: 0.9246642161466019
Best params: {'weights': 'distance', 'p': 1, 'n_neighbors': 44, 'leaf_size': 100, 'algorithm': 'kd_tree'}
Best estimator: KNeighborsClassifier(algorithm='kd_tree', leaf_size=100, n_neighbors=44, p=1,
                     weights='distance')


In [None]:
# Tuning the SVM Classifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

"""
GridSearchCV yielded a higher prediction score than RandomizedSearchCV for the SVM Classifier.
"""

tuning_params = {
  'C': [0.1, 1, 10, 100],
  'kernel': ['sigmoid', 'rbf'],
  'degree': [2, 3, 4],
  'gamma': ['scale'],
  'coef0': [0.0, 0.1, 0.5]
}
tuned_svm_clf = GridSearchCV(SVC(), tuning_params, cv=5)
tuned_svm_clf.fit(training_features_set, training_target_set)
print("Best score:", tuned_svm_clf.best_score_)
print("Best params:", tuned_svm_clf.best_params_)

Best score: 0.919751012851815
Best params: {'C': 100, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}


### 3.3 Selecting the final model and producing predictions

In [None]:
final_model = tuned_random_forest_clf # The Random Forest Classifier model has the highest predictive score
final_model.predict(testing_features_set) # Use it to produce predictions on the test dataset (Also check out the Python script.py file)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,