In [None]:
# Prepare the toolbox

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [None]:
# Read the clean, pre-labeled PSTH table, using the 'id' column as the row index.
# keep_default_na=False stops pandas from turning its default NA markers
# (including empty fields) into NaN, so unlabeled rows carry pattern == ''.

df = pd.read_csv(
    "./data/psth_data_IC_preclassified.csv",
    index_col='id',
    keep_default_na=False,
)
df

In [None]:
# Check for missing values
#
# The table was loaded with keep_default_na=False, so blank fields arrive as
# empty strings rather than NaN and a plain isna() test cannot see them.
# Look for both NaN and '' in the feature columns; 'pattern' is left out because
# '' is its legitimate "unlabeled" marker.

features_only = df.drop(columns='pattern')
bool(features_only.isna().to_numpy().any() or features_only.eq('').to_numpy().any())

In [None]:
# Heuristic classification report: per-pattern counts, number of labeled recordings,
# and the share of recordings the heuristic left unlabeled (pattern == '').

labeled_mask = df['pattern'] != ''
n_recordings = len(df)
n_labeled = int(labeled_mask.sum())

# errors='ignore' keeps the report working when every recording is already labeled
# (there is then no '' entry to drop).
display(df['pattern'].value_counts().drop('', errors='ignore'))
print(n_labeled, "recordings met the criteria to be labeled in one category.")

# Guard against an empty table so the percentage does not divide by zero.
pct_unlabeled = (n_recordings - n_labeled) * 100 / n_recordings if n_recordings else 0.0
print(round(pct_unlabeled, 1), "% of the recordings remain unlabeled.")

In [None]:
# Partition the recordings by whether the heuristic assigned a pattern ('' = unlabeled)

has_label = df['pattern'] != ''
df_labeled = df[has_label]
df_unlabeled = df[~has_label]

# Features (every column except 'pattern') and target ('pattern') of the labeled recordings

X = df_labeled.drop(columns='pattern')
y = df_labeled['pattern']

# Hold out 30% of the labeled recordings for testing; the seed makes the split repeatable.
# NOTE(review): the split is not stratified - consider stratify=y if the class
# imbalance leaves some patterns under-represented in the test set.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Balancing

In [None]:
# Class counts of the target over all labeled recordings, i.e. before the
# train/test split and before any resampling; kept for later comparison.
balance_1=df_labeled['pattern'].value_counts() #value counts of the target directly from the dataset
print(balance_1)

### 1 - TOMEK Links undersampling

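First, we apply an undersampling technique (Tomek links) to the training data only (`X_train` / `y_train` from the train/test split); the test data is left untouched. This will serve to "soften" the clusters of each category by removing samples, from every class except the minority one, that sit right next to a sample of a different class.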

In [None]:
from imblearn.under_sampling import TomekLinks

# First Tomek-links pass: resample every class except the minority one
tl = TomekLinks(sampling_strategy='not minority')

# Undersample the training split only (X_train / y_train)
X_train_tl_1, y_train_tl_1 = tl.fit_resample(X_train, y_train)

# Class counts of the target after the first Tomek-links pass
balance_2 = y_train_tl_1.value_counts()
display(balance_2)

### 2 - SMOTE

The class imbalance has been reduced, but the classes are still not balanced enough. Next, we apply oversampling with SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE generates its synthetic samples at random. Fix the seed (same value as the
# train/test split) so the resampled training set, and every result derived from it,
# is reproducible across runs.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_tl_1, y_train_tl_1)  # X_train and y_train after SMOTE
display(y_train_sm.value_counts())
balance_3 = y_train_sm.value_counts()  # value counts of the target after SMOTE

As expected, the oversampling has completely balanced the class counts in the y_train dataset. However, this balance is artificial: many synthetic ("fake") samples have been added, and they could lead our model to make artificial predictions.

### 3 - TOMEK Links polishing

In [None]:
# Second Tomek-links pass, this time on the SMOTE output (X_train_sm / y_train_sm).
# The strategy is switched to 'all' so that every class is resampled, now that the
# classes have equal counts after SMOTE.
tl_ = TomekLinks(sampling_strategy='all')
X_train_tl_2, y_train_tl_2 = tl_.fit_resample(X_train_sm, y_train_sm)

# Class counts of the target after the second Tomek-links pass
balance_4 = y_train_tl_2.value_counts()
display(balance_4)

The balance of our data is seemingly good now. We proceed to train our model and make predictions.

# Model Evaluation

In [None]:
# Fit a support vector classifier (SVC with default settings) on the training data
# produced by the Tomek links -> SMOTE -> Tomek links resampling (X_train_tl_2 / y_train_tl_2).
# Note: no feature scaling is applied in this cell, and LogisticRegression is not used.

clf = SVC() # probability=True would be needed for the commented-out predict_proba calls below
clf.fit(X_train_tl_2, y_train_tl_2)

In [None]:
# Evaluate on the held-out test split: accuracy, predicted vs. true class counts,
# confusion matrix and per-class report.

test_accuracy = clf.score(X_test, y_test)
print("Score =", round(test_accuracy, 3))

predictions = clf.predict(X_test)
#pred_probs = clf.predict_proba(X_test) # To get the probability of each class

# Predicted class counts first, then the true class counts of the test split
display(pd.Series(predictions).value_counts())
display(y_test.value_counts())

# Rows/columns follow the classifier's own class order so the tick labels line up
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
cm_display.plot(cmap='Blues')
plt.show()

print(classification_report(y_test, predictions))

In [None]:
# Probabilities of each class

#probs = pd.DataFrame(pred_probs, columns=clf.classes_, index=X_test.index)
#unsure = probs[probs.max(axis=1)<0.99] # Get dubious cases
#unsure

# MODEL APPLICATION

In [None]:
# Label the recordings the heuristic left unlabeled: strip the empty 'pattern' column,
# predict a pattern from the remaining columns, and attach it as the new 'pattern' column.
unlabeled_features = df_unlabeled.drop(columns='pattern')
df_predicted = unlabeled_features.assign(pattern=clf.predict(unlabeled_features))
df_predicted

In [None]:
# Check results: plot every recording that the model assigned to the chosen pattern.

pattern = 'sustain' # Select the pattern you want to check

# Stimulus: pure tone (75 ms). Interstimulus interval of 250 ms (4 Hz presentation rate)
tone_x, tone_y = [0,75],[-0.05,-0.05] # info for Matplotlib

# Drop the label column once, outside the loop, instead of on every iteration.
predicted_traces = df_predicted.drop(columns='pattern')
matching_ids = df_predicted.index[df_predicted['pattern'] == pattern]

# The loop variable is named rec_id (not id) so the builtin id() is not shadowed
# for the rest of the notebook session.
for rec_id in matching_ids:
    ax = predicted_traces.loc[rec_id].plot(kind='line')
    ax.set_title(rec_id)
    ax.set_ylim([-0.1, 1])
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Spike density (norm)')
    # Bar marking the tone, drawn just below the trace
    ax.plot(tone_x, tone_y, marker = 'o', label='Tone')
    ax.legend()
    plt.show()