# Imports

In [41]:
import pandas as pd
import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import matplotlib.pyplot as plt

# **Data preprocessing**

In [2]:
random_state = 1000
df = pd.read_csv("Analysis.csv")

In [3]:
# Dropping all rows containing a na value
df.dropna(inplace=True)

In [None]:
df.info()

In [5]:
# Setting the data set as labels and values
df = df.drop(['index','id', 'name'], axis=1)
y = df[['label']]
X = df.drop('label', axis=1)
df = df.drop('label', axis=1)

**Scaling the data**

In [6]:
scale = StandardScaler()
X = scale.fit_transform(X)
X = pd.DataFrame(X, columns=df.columns)

# **Applying feature extraction and LDA**

In [29]:
# Applying dimensionality reduction with Kernel PCA so that the classes are distributed as far from each other as possible
PCA = KernelPCA(n_components = 2, kernel = 'rbf') # Adjust the components accordingly
X_reduced = PCA.fit_transform(X)

In [30]:
# Applying Linear Discriminant Analysis so that the classes are more compact
classes = y.nunique()[0]
LDA = LinearDiscriminantAnalysis()
X_reduced_lda = LDA.fit_transform(X_reduced,y.values.ravel())

**Visualizing the data**

In [None]:
dfc = pd.concat([pd.DataFrame(X_reduced_lda), y], axis=1)

# Separate the data based on the unique values in the 'label' column (assuming the label column is named 'label')
unique_labels = dfc['label'].unique()

# Create a scatter plot for each unique label
for label in unique_labels:
    plt.scatter(dfc[dfc['label'] == label][0], dfc[dfc['label'] == label][1], label=label, s=5)  # Set the size to 10

# Set labels and title
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Scatter Plot of Features with Labels')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Show legend

# Show the plot
plt.show()

It is evident that reducing the dataset into two dimensions takes away a lot of information regarding the classes actual variance.

# **Training the models**

**Metrics method**

In [66]:
# Defining a method to calculate the accuracy, precision, recall and f1 score for a model

def metrics (y_true, y_pred):
  return (accuracy_score(y_true, y_pred),
          precision_score(y_true, y_pred, average='weighted', zero_division=1),
          recall_score(y_true, y_pred, average='weighted'),
          f1_score(y_true, y_pred, average='weighted'))

In [7]:
# Splitting the dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

**Random Forest**

In [8]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train.values.ravel())
y_pred_rf = random_forest.predict(X_test)

In [62]:
metrics_rf = metrics(y_test, y_pred_rf)
metrics_rf

(0.5305210918114144, 0.529612998873515, 0.5305210918114144, 0.5290614856493208)

**MLP**

In [132]:
# Applying a grid search method first to decide the optimal MLP configuration
param_grid = {'activation':['relu', 'tanh', 'logistic', 'identity'], 'alpha':[0.1, 0.01, 0.001, 0.0001], 'learning_rate':['constant', 'adaptive'], 'solver':['adam', 'sgd']}
grid = GridSearchCV(estimator=MLPClassifier(),param_grid= param_grid, refit = True, verbose=3, cv=2)

In [None]:
grid.fit(X_train, y_train)

In [134]:
grid.best_params_

{'activation': 'tanh',
 'alpha': 0.001,
 'learning_rate': 'adaptive',
 'solver': 'adam'}

In [10]:
mlp = MLPClassifier(activation='tanh', alpha=0.001, learning_rate='adaptive', solver='adam', max_iter=1000 )

In [11]:
mlp.fit(X_train, y_train.values.ravel())
y_pred_mlp = mlp.predict(X_test)

In [70]:
metrics_mlp = metrics(y_test, y_pred_mlp)
metrics_mlp

(0.4784119106699752,
 0.5383369989219166,
 0.4784119106699752,
 0.45202306231151645)

**KNN**

In [17]:
classes = y.nunique()[0]
knn = KNeighborsClassifier(n_neighbors = classes)

In [None]:
knn.fit(X_train, y_train.values.ravel())

In [20]:
y_pred_knn = knn.predict(X_test)

In [71]:
metrics_knn = metrics(y_test, y_pred_knn)
metrics_knn

(0.5915632754342431,
 0.6053323690659862,
 0.5915632754342431,
 0.5872018019170631)

# **Plotting learning curves**

In [None]:
classifiers = [random_forest, mlp, knn]
classifier_names = ['RandomForest', 'MLP', 'KNN']

# Plot learning curves for each classifier
plt.figure(figsize=(12, 8))

for classifier, name in zip(classifiers, classifier_names):
    train_sizes, train_scores, test_scores = learning_curve(classifier, X_train, y_train.values.ravel(), cv=2, scoring='accuracy')

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, label=f'{name} - Training Score', marker='o')
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2)

    plt.plot(train_sizes, test_scores_mean, label=f'{name} - Cross-validation Score', marker='o')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2)

plt.title('Learning Curves for Classifiers')
plt.xlabel('Training Examples')
plt.ylabel('Score')
plt.legend(loc='best')
plt.grid(True)
plt.show()

# **Comparing results on original dataset vs dataset after KPCA-LDA**

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced_lda, y, test_size=0.3, random_state=random_state)

In [73]:
random_forest.fit(X_train, y_train.values.ravel())
y_pred_rf = random_forest.predict(X_test)
metrics_rf_red = metrics(y_test, y_pred_rf)

In [74]:
mlp.fit(X_train, y_train.values.ravel())
y_pred_mlp = mlp.predict(X_test)
metrics_mlp_red = metrics(y_test, y_pred_mlp)

In [75]:
knn.fit(X_train, y_train.values.ravel())
y_pred_knn = knn.predict(X_test)
metrics_knn_red = metrics(y_test, y_pred_knn)

In [81]:
print(metrics_rf_red, metrics_rf)
print(metrics_mlp_red, metrics_mlp)
print(metrics_knn_red, metrics_knn) # Results are similar so feature extraction
                                    # doesn't appear to be useful

(0.5230769230769231, 0.5213323533619217, 0.5230769230769231, 0.5215818955523066) (0.5305210918114144, 0.529612998873515, 0.5305210918114144, 0.5290614856493208)
(0.4794044665012407, 0.5393908283488321, 0.4794044665012407, 0.45308801547487404) (0.4784119106699752, 0.5383369989219166, 0.4784119106699752, 0.45202306231151645)
(0.47096774193548385, 0.46902363287155246, 0.47096774193548385, 0.4665689679665557) (0.5915632754342431, 0.6053323690659862, 0.5915632754342431, 0.5872018019170631)
