# Importing Training and Testing Data

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

train_data = pd.read_csv('data/train.csv')

X_t = train_data.drop(['category', 'ID'], axis=1)
y_t = train_data['category']

test_data = pd.read_csv('data/test.csv')

X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.1, random_state=42)

# Approach 3 - New Pipeline

# Imports

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from pipeline_components import *
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.cluster import DBSCAN

In [None]:
def generate_submission(pipeline, test_data):
    from datetime import datetime

    X_test = test_data.drop(['ID'], axis=1)
    y_pred = pipeline.predict(X_test)

    submission = pd.DataFrame({'ID': test_data['ID'], 'Category': y_pred})
    submission.to_csv(f"submissions/submission_{(datetime.now()).strftime('%Y_%m_%d-%H_%M')}.csv", index=False)


In [None]:
pipeline = Pipeline([
    ("PCA 250", PCA(n_components=0.99)),
    ("LDA 19", LinearDiscriminantAnalysis(n_components=19)),
    ("MLP", MLPClassifier(activation='relu', solver='lbfgs', alpha=10, hidden_layer_sizes=(100, 100, 100, 100, 100), random_state=1, max_iter=1000)),
    ])
# MLPClassifier(activation='relu', solver='lbfgs', alpha=10, hidden_layer_sizes=(319), random_state=1)

pipeline.fit(X_t, y_t)
print("Pipeline done")
cross_val_scores=cross_val_score(pipeline, X_t, y_t, cv=5)

In [None]:
cross_val_scores.mean(), cross_val_scores.std()

In [None]:
generate_submission(pipeline, test_data)

In [10]:
def dbscan_grid_search(X_data, lst, clst_count, eps_space = 0.5,
                       min_samples_space = 5, min_clust = 0, max_clust = 10):
    """
Performs a hyperparameter grid search for DBSCAN.

Parameters:
    * X_data            = data used to fit the DBSCAN instance
    * lst               = a list to store the results of the grid search
    * clst_count        = a list to store the number of non-whitespace clusters
    * eps_space         = the range values for the eps parameter
    * min_samples_space = the range values for the min_samples parameter
    * min_clust         = the minimum number of clusters required after each search iteration in order for a result to be appended to the lst
    * max_clust         = the maximum number of clusters required after each search iteration in order for a result to be appended to the lst


Example:

# Loading Libraries
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Loading iris dataset
iris = datasets.load_iris()
X = iris.data[:, :] 
y = iris.target

# Scaling X data
dbscan_scaler = StandardScaler()

dbscan_scaler.fit(X)

dbscan_X_scaled = dbscan_scaler.transform(X)

# Setting empty lists in global environment
dbscan_clusters = []
cluster_count   = []


# Inputting function parameters
dbscan_grid_search(X_data = dbscan_X_scaled,
                   lst = dbscan_clusters,
                   clst_count = cluster_count
                   eps_space = pd.np.arange(0.1, 5, 0.1),
                   min_samples_space = pd.np.arange(1, 50, 1),
                   min_clust = 3,
                   max_clust = 6)

"""
    # Importing counter to count the amount of data in each cluster
    from collections import Counter

    # Starting a tally of total iterations
    n_iterations = 0

    # Looping over each combination of hyperparameters
    for eps_val in eps_space:
        for samples_val in min_samples_space:

            dbscan_grid = DBSCAN(eps = eps_val,
                                 min_samples = samples_val)

            # fit_transform
            clusters = dbscan_grid.fit_predict(X = X_data)

            # Counting the amount of data in each cluster
            cluster_count = Counter(clusters)

            # Saving the number of clusters
            n_clusters = sum(abs(pd.np.unique(clusters))) - 1

            # Increasing the iteration tally with each run of the loop
            n_iterations += 1

            # Appending the lst each time n_clusters criteria is reached
            if n_clusters >= min_clust and n_clusters <= max_clust:

                lst.append([eps_val,
                            samples_val,
                            n_clusters])

                clst_count.append(cluster_count)

    # Printing grid search summary information
    print(f"""Search Complete. \nYour list is now of length {len(lst)}. """)
    print(f"""Hyperparameter combinations checked: {n_iterations}. \n""")
    
    return lst

# apply PCA to reduce dimensionality
pca = PCA(n_components=415)
X_pca = pca.fit_transform(X_t)

# apply LDA to reduce dimensionality
lda = LinearDiscriminantAnalysis(n_components=19)
X_lda = lda.fit_transform(X_pca, y_t)

# scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lda)

    

In [23]:
# apply DBSCAN
dbscan_clusters = []
cluster_count = []
optimal_hyperparams = dbscan_grid_search(X_data = X_scaled,
                        lst = dbscan_clusters,
                        clst_count = cluster_count,
                        eps_space = pd.np.arange(0.1, 5, 0.1),
                        min_samples_space = pd.np.arange(1, 50, 1),
                        min_clust = 3,
                        max_clust = 6)



  eps_space = pd.np.arange(0.1, 5, 0.1),
  min_samples_space = pd.np.arange(1, 50, 1),
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.

Search Complete. 
Your list is now of length 117. 
Hyperparameter combinations checked: 2401. 



  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1
  n_clusters = sum(abs(pd.np.unique(clusters))) - 1


In [24]:
# Evaluate performance of each hyperparameter combination
from sklearn.metrics import silhouette_score

scores = []
for eps, min_samples, n_clusters in optimal_hyperparams:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, clusters)
    scores.append(score)

# Choose optimal hyperparameters
optimal_index = np.argmax(scores)
optimal_hyperparam = optimal_hyperparams[optimal_index]

# sort optimal hyperparameters by silhouette score, keeping only the optimal hyperparameters
optimal_hyperparams = sorted(zip(optimal_hyperparams, scores), key=lambda x: x[1], reverse=True)

In [25]:
print(optimal_hyperparam)
print(optimal_hyperparams)

[3.2, 41, 6]
[([3.2, 41, 6], 0.11487205813933835), ([3.3000000000000003, 36, 3], 0.11359020188634901), ([3.3000000000000003, 37, 3], 0.11359020188634901), ([3.3000000000000003, 38, 3], 0.11314005775953512), ([3.3000000000000003, 41, 6], 0.11299394465616752), ([3.3000000000000003, 42, 3], 0.11247105424326777), ([3.3000000000000003, 43, 3], 0.11245126789685028), ([3.3000000000000003, 44, 3], 0.11227382731724155), ([3.3000000000000003, 45, 3], 0.11201809338860873), ([3.3000000000000003, 46, 3], 0.11162842552887862), ([3.2, 27, 6], 0.11101855307600997), ([3.3000000000000003, 39, 3], 0.11094262793000977), ([3.2, 29, 6], 0.11038500705011993), ([3.2, 32, 3], 0.11027095258813273), ([3.2, 28, 6], 0.11004224298369375), ([3.1, 25, 6], 0.11003346263711933), ([3.2, 30, 6], 0.10992213493086682), ([3.2, 44, 6], 0.10973889647205937), ([3.1, 24, 6], 0.1096689101438101), ([3.1, 21, 6], 0.10946954017676354), ([3.1, 22, 6], 0.10946954017676354), ([3.1, 23, 6], 0.10946954017676354), ([3.2, 33, 3], 0.109005