In [1]:
# python version 3.10.7
# scikit-learn == 1.1.2
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

# load datasets

In [2]:
df = pd.read_csv(r"..\data_files\ecq_rank_0_or_1_cond_limit_5000_one_per_isog_class_balanced.csv")
print(f"We are using {len(df.columns)} a_p's for this dataset.")

# create a normalize df: df_norm where 
# df_norm[p] = df[p]/sqrt(p)
df_norm = df.copy()
cols = df_norm.columns[:-1]
for p in cols:
    df_norm[p] = (df[p]) / int(np.sqrt(int(p)))

We are using 201 a_p's for this dataset.


# create train and test data

In [3]:
from sklearn.model_selection import train_test_split
# create train and test sets from df
# shuffle=False
# default test_size=0.25
X = df.drop(columns=['rank']).values
y = df['rank'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

# create train and test sets from df_norm
# shuffle=False
# default test_size=0.25
X_norm = df_norm.drop(columns=['rank']).values
y_norm = df_norm['rank'].values
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_norm, y_norm, shuffle=False)

# models

In [4]:
# available models: SVM, Random Forest, Logistic Regression
models = {'SVM' : [SVC, {'kernel' : 'linear'}],
          'Random Forest' : [RandomForestClassifier, {'n_estimators' : 100}], 
          'Logistic Regression' : [LogisticRegression, {'max_iter' : 3000}]
          }

In [5]:
# create a df to store the results
results = pd.DataFrame(columns=['Model', 'Accuracy on train set', 'MCC on test set', 'Accuracy on train set (Normalized)', 'MCC on test set (Normalized)'])

for model_name in models:
    # create a model
    params = models[model_name][1]
    model = models[model_name][0]
    clf = model(**params)
    
    # train a svm model for X_train and y_train
    clf.fit(X_train, y_train)
    # show the accuracy of the model on the train data
    y_train_pred = clf.predict(X_train)
    train_accuracy = clf.score(X_train, y_train)
    # show the MCC of the model on the test data
    y_test_pred = clf.predict(X_test)
    test_mcc_score = matthews_corrcoef(y_test, y_test_pred)
    # print(f"For NON-NORMALZED dateset: Accuracy on train set: {clf.score(X_train, y_train)}. MCC on test set: {test_mcc_score}")

    # --------------------------------------------
    # train a svm model for X_train_norm and y_train_norm
    clf.fit(X_train_norm, y_train_norm)
    # show the accuracy of the model on the train data
    norm_y_train_pred = clf.predict(X_train_norm)
    norm_train_accuracy = clf.score(X_train_norm, y_train_norm)
    # show the MCC of the model on the train and data
    norm_y_test_pred = clf.predict(X_test_norm)
    norm_test_mcc_score = matthews_corrcoef(y_test_norm, norm_y_test_pred)
    # print(f"For NORMALZED dateset: Accuracy on train set: {clf.score(X_train_norm, y_train_norm)}. MCC on test set: {norm_test_mcc_score}")

    # --------------------------------------------
    # store the results in the results df
    results = results.append({'Model': model_name, 
                              'Accuracy on train set': train_accuracy, 
                              'MCC on test set': test_mcc_score,
                              'Accuracy on train set (Normalized)': norm_train_accuracy, 
                              'MCC on test set (Normalized)': norm_test_mcc_score}, 
                              ignore_index=True)
    

  results = results.append({'Model': model_name,
  results = results.append({'Model': model_name,
  results = results.append({'Model': model_name,


# results

In [6]:
results

Unnamed: 0,Model,Accuracy on train set,MCC on test set,Accuracy on train set (Normalized),MCC on test set (Normalized)
0,SVM,1.0,0.992883,1.0,0.992908
1,Random Forest,1.0,0.843438,1.0,0.818511
2,Logistic Regression,1.0,0.992908,1.0,0.98938


In [7]:
# # demo using SVM model
# model_name = 'SVM'

# # create a model
# clf = models[model_name]()
# # train a svm model for X_train and y_train
# clf.fit(X_train, y_train)
# # show the accuracy of the model on the train and data
# y_test_pred = clf.predict(X_test)
# test_mcc_score = matthews_corrcoef(y_test, y_test_pred)
# print(f"For NON-NORMALZED dateset: Accuracy on train set: {clf.score(X_train, y_train)}. MCC on test set: {test_mcc_score}")

# # --------------------------------------------
# # train a svm model for X_train_norm and y_train_norm
# clf.fit(X_train_norm, y_train_norm)
# # show the accuracy of the model on the train and data
# norm_y_test_pred = clf.predict(X_test_norm)
# norm_test_mcc_score = matthews_corrcoef(y_test_norm, norm_y_test_pred)
# print(f"For NORMALZED dateset: Accuracy on train set: {clf.score(X_train_norm, y_train_norm)}. MCC on test set: {norm_test_mcc_score}")