## Imports

In [2]:
import pandas as pd
import numpy as np
import time
from memory_profiler import memory_usage
import os
import io
import sys
import warnings


from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from google.colab import drive

warnings.filterwarnings("ignore", category=ConvergenceWarning)


## dataset creation

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# This is a dataset about iris flowers information and classification of its species
df_iris = pd.read_csv('/content/drive/MyDrive/TCC/datasets i could use/Iris.csv')

# This is a dataset about pacients physical health and classification of weather they have a low (0) or high (1) risk of heart disease
df_heart = pd.read_csv('/content/drive/MyDrive/TCC/datasets i could use/heart.csv')

In [5]:
df_iris

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
df_heart

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Analysis

In [65]:
class Analysis:
    def __init__(self, dataframe, technique, model='SVM'):
        self.memory_measurements = []

        self.prepare_data(dataframe)
        self.memory_measurements.append(memory_usage()[0])

        self.perform_analysis(technique, model)
        self.memory_measurements.append(memory_usage()[0])

        self.max_memory_usage = max(self.memory_measurements)

    def prepare_data(self, dataframe):
        # Drop 'id' column and shuffle the dataframe
        if 'id' in dataframe.columns:
            dataframe.drop('id', axis=1, inplace=True)
        dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)

        # Define X matrix and y column
        self.X = dataframe.iloc[:, :-1]
        self.y = dataframe.iloc[:, -1]

    def perform_analysis(self, technique, model):
        kf = KFold(n_splits=5)
        f1_scores = []

        for train_index, test_index in kf.split(self.X):
            self.split_data(train_index, test_index)

            if technique == 'PCA':
                self.apply_pca()
            elif technique == 'IncPCA':
                self.apply_ipca()

            self.apply_normalization()

            f1 = self.select_model_and_get_f1(model)
            f1_scores.append(f1)

        self.f1_total = np.mean(f1_scores)

    def split_data(self, train_index, test_index):
        self.X_train, self.X_test = self.X.iloc[train_index], self.X.iloc[test_index]
        self.y_train, self.y_test = self.y.iloc[train_index], self.y.iloc[test_index]

    def apply_pca(self):
        pca = PCA(n_components=0.95)
        self.X_train = pca.fit_transform(self.X_train)
        self.memory_measurements.append(memory_usage()[0])
        self.X_test = pca.transform(self.X_test)

    def apply_ipca(self):
        pca = PCA()
        pca.fit(self.X_train)
        n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
        ipca = IncrementalPCA(n_components=n_components)
        self.X_train = ipca.fit_transform(self.X_train)
        self.memory_measurements.append(memory_usage()[0])
        self.X_test = ipca.transform(self.X_test)


    def apply_normalization(self):
        scaler = StandardScaler()
        self.X_train = scaler.fit_transform(self.X_train)
        self.memory_measurements.append(memory_usage()[0])
        self.X_test = scaler.transform(self.X_test)

    def select_model_and_get_f1(self, model):
        if model == 'SVM':
            return self.get_f1_score(SVC(kernel='linear'))
        elif model == 'MLP':
            return self.get_f1_score(MLPClassifier(hidden_layer_sizes=(50,), activation='relu', solver='adam', max_iter=300))
        elif model == 'Tree':
            return self.get_f1_score(DecisionTreeClassifier())
        elif model == 'KNN':
          return self.get_f1_score(KNeighborsClassifier())
        elif model == 'LogReg':
          return self.get_f1_score(LogisticRegression())

    def get_f1_score(self, classifier):
        classifier.fit(self.X_train, self.y_train)
        self.memory_measurements.append(memory_usage()[0])
        self.y_pred = classifier.predict(self.X_test)
        return f1_score(self.y_test, self.y_pred, average='weighted')

## Plot

## Main

In [62]:
results = pd.DataFrame(columns = ['dataset', 'technique', 'model', 'f1_score', 'processing_time', 'memory_usage'])

In [66]:
def run(dataset, technique, model):

    if dataset == 'df_iris':
        dataset = df_iris
    elif dataset == 'df_heart':
        dataset = df_heart

    a = Analysis(dataset, technique, model)
    return a

In [67]:
for dataset in ['df_iris', 'df_heart']:
    for technique in ['None', 'PCA', 'IncPCA']:
        for model in ['SVM', 'MLP', 'Tree', 'KNN', 'LogReg']:
            start_time = time.time()

            a = run(dataset, technique, model)

            end_time = time.time()
            processing_time = end_time - start_time

            #mem_usage = np.mean(memory_usage((run, (dataset, technique, model, ), {})))

            results.loc[len(results)] = [dataset, technique, model, a.f1_total, processing_time, a.max_memory_usage]

In [59]:
results

Unnamed: 0,dataset,technique,model,f1_score,processing_time,memory_usage
0,df_iris,,SVM,1.0,1.258982,304.40625
1,df_iris,,MLP,0.993333,1.990684,304.40625
2,df_iris,,Tree,0.993347,1.255287,304.40625
3,df_iris,,KNN,0.993333,1.270578,304.40625
4,df_iris,,LogReg,0.993333,1.2971,304.40625
5,df_iris,PCA,SVM,0.973329,1.772282,304.40625
6,df_iris,PCA,MLP,0.980009,2.690476,304.40625
7,df_iris,PCA,Tree,1.0,1.754153,304.40625
8,df_iris,PCA,KNN,0.993347,1.772685,304.40625
9,df_iris,PCA,LogReg,0.986701,1.772357,304.40625


In [68]:
a.memory_measurements

[304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125,
 304.55078125]

In [28]:
a.f1_total

0.6952179190791334

In [29]:
processing_time

0.08985185623168945