In [1]:
import pandas as pd
import numpy as np
import random
import time
import sys
import os

# Add the directory two levels above the working directory to the import path
# (presumably where the project-local `functions` package lives — confirm).
sys.path.append(os.path.join("..", ".."))

# Word Embedding
from sentence_transformers import SentenceTransformer

# Created functions
from functions.Experimentations import ParamSearch
# Seeds Python's built-in `random` module only; NumPy's global random state is
# not seeded here.
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Number of components in each embedding vector. It sizes the embedding
# matrices and marks where the feature columns end in the combined data
# matrices (columns [0, vector_length) hold the embedding).
# NOTE(review): 768 is assumed to match the output size of the LaBSE model used in this notebook — confirm.
vector_length = 768

# Data Import

In [None]:
# Load both datasets from CSV.
# The paths are relative to the working directory; adapt them to your own data.

# Dataset identifiers ("cpn27" and "cpn120" are the available options)
dataset_name_cpn27, dataset_name_cpn120 = "cpn27", "cpn120"

# Text standardization variant to load.
# Valid values: "raw", "normalize", "normalize_wo_stop", and "lemmatize"
type_standardization = "normalize"

# Read each file and replace missing cells with empty strings
data_cpn27 = pd.read_csv(
    f"normalize/{dataset_name_cpn27.upper()}_{type_standardization}.csv",
    sep=",",
).fillna(value='')

data_cpn120 = pd.read_csv(
    f"normalize/{dataset_name_cpn120.upper()}_{type_standardization}.csv",
    sep=",",
).fillna(value='')

# Model

In [4]:
# Sentence-embedding model (LaBSE checkpoint) used to encode every description
# in the cells that follow.
model = SentenceTransformer('sentence-transformers/LaBSE')

In [None]:
# Start the stopwatch for the whole CPN27 embedding step
start = time.time()

# Zero-initialised matrix: one row per observation, one column per embedding component
n_rows_cpn27 = data_cpn27.shape[0]
descriptions_matrix = np.zeros((n_rows_cpn27, vector_length))

# Encode the text found in the second column and store each vector in its row
for row_idx, text in enumerate(data_cpn27.iloc[:, 1]):
    descriptions_matrix[row_idx, :] = model.encode(text)

# Place the embeddings side by side with the original columns of each observation
data_matrix_cpn27 = np.concatenate([descriptions_matrix, data_cpn27], axis=1)

# Keep only the rows whose embedding part contains no missing value
has_missing = pd.isnull(data_matrix_cpn27[:, :vector_length]).any(axis=1)
data_matrix_cpn27 = data_matrix_cpn27[~has_missing]

end = time.time()
print("Execution time:", end-start)

Execution time: 223.61245012283325


In [None]:
# Build the CPN120 data matrix: sentence embeddings followed by the original columns.

# Timer
start = time.time()

# Zero-initialised matrix that will hold one embedding per CPN120 observation
descriptions_matrix = np.zeros(
    (
        data_cpn120.shape[0], # the number of data points
        vector_length # the number of components of the word embedding
    )
)

# Fill the matrix with the embedding of each CPN120 description (second column).
# The loop must read from data_cpn120, not data_cpn27: otherwise the CPN120
# rows are paired with embeddings of CPN27 texts, or an IndexError is raised
# when CPN27 has more rows than the matrix allocated above.
for i, description in enumerate(data_cpn120.iloc[:, 1]):
    vector = model.encode(description)
    descriptions_matrix[i, :] = vector

# Concatenate the embeddings with the data of each observation
data_matrix_cpn120 = np.concatenate([descriptions_matrix, data_cpn120], axis=1)

# Drop the rows whose embedding part contains missing values
data_matrix_cpn120 = data_matrix_cpn120[~pd.isnull(data_matrix_cpn120[:, :vector_length]).any(axis=1)]


end = time.time()
print("Execution time:", end-start)

# AC-PLT

In [None]:
from functions.Experimentations import ParamSearch
from functions.AC_PLT import AC_PLT

# Search grid: candidate cluster counts 50, 100, ..., 1750
parameters = {'n_clusters': np.arange(50, 1800, 50)}
ac_plt = AC_PLT()
estimator_name = type(ac_plt).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27: embedding columns are the features; the file name is handed to `fit`
# (presumably where ParamSearch stores its results — confirm).
file_name_cpn27 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name
)
clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120: same search on the second dataset, with a fresh ParamSearch object
file_name_cpn120 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name
)
clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)



<functions.Experimentations.ParamSearch at 0x235d2a7a650>

# Naive Bayes

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# Grid over `var_smoothing`.
# Alternative grid kept for reference: np.geomspace(1e-06, 1e+02, num=9)
parameters = {'var_smoothing': (5, 10, 20, 30, 40, 50, 60, 70, 80, 90)}
proc = GaussianNB()
estimator_name = type(proc).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search
file_name_cpn27 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name
)
clf = ParamSearch(proc, parameters)
clf.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name
)
clf = ParamSearch(proc, parameters)
clf.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

# SVC

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF-kernel SVC; grid of 6 geometrically spaced C values between 1e-03 and 1e+01
kernel='rbf'
parameters = {'C': np.geomspace(1e-03, 1e+01, num=6)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
estimator_name = type(svc_rbf).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search (the kernel name is part of the file name)
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF-kernel SVC again, now with 4 geometrically spaced C values between 1e+02 and 1e+04.
# NOTE(review): the file names built here equal those of the other RBF cell;
# whether ParamSearch overwrites or appends is not visible here — confirm.
kernel='rbf'
parameters = {'C': np.geomspace(1e+02, 1e+04, num=4)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
estimator_name = type(svc_rbf).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Linear-kernel SVC (the variable keeps the name `svc_rbf` used by the other SVC cells).
# Alternative grid kept for reference: np.geomspace(1e-05, 1e+01, num=7)
kernel='linear'
parameters = {'C': [1, 2, 3, 4, 5, 6]}
svc_rbf = SVC(kernel=kernel, gamma='auto')
estimator_name = type(svc_rbf).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search (the kernel name is part of the file name)
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

# clf_acplt.to_csv()

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Polynomial-kernel SVC; the grid explores degrees 1 through 6
# (the variable keeps the name `svc_rbf` used by the other SVC cells).
kernel='poly'
parameters = {'degree': np.arange(1, 7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
estimator_name = type(svc_rbf).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search (the kernel name is part of the file name)
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Sigmoid-kernel SVC; grid of 7 geometrically spaced C values between 1e-05 and 1e+01
# (the variable keeps the name `svc_rbf` used by the other SVC cells).
kernel='sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
estimator_name = type(svc_rbf).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search (the kernel name is part of the file name)
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, kernel
)
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

# Decision Tree

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

# Gini decision tree; the grid explores max_leaf_nodes = 100, 200, ..., 1000
criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(100, 1001, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
estimator_name = type(desition_tree).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search (the split criterion is part of the file name)
file_name_cpn27 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name, criterion
)
clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name, criterion
)
clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)


# KNN

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours; the grid explores n_neighbors = 10, 20, ..., 500
parameters = {'n_neighbors': np.arange(10, 501, 10)}
knn = KNeighborsClassifier()
estimator_name = type(knn).__name__
target_column = vector_length + 2  # column of the combined matrix passed as the target

# CPN27 search
file_name_cpn27 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name
)
clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(
    data_matrix_cpn27[:, :vector_length],
    data_matrix_cpn27[:, target_column],
    file_name_cpn27,
)

# CPN120 search
file_name_cpn120 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name
)
clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(
    data_matrix_cpn120[:, :vector_length],
    data_matrix_cpn120[:, target_column],
    file_name_cpn120,
)

# XGBoost


## CPN27


In [None]:
# Pad under-represented classes of CPN27 so that every class has at least `n` rows.
# Each padding row has an all-zero embedding, blank values in the remaining
# columns, and the class label in column `vector_length + 2`.

# Minimum number of rows required per class
n=5

# Number of rows per class label, restricted to the classes below the minimum
cod = pd.DataFrame(data_matrix_cpn27).iloc[:,vector_length+2].value_counts()
reduce_cod = cod[cod<n]

# Work on an ndarray copy so `data_matrix_cpn27` stays untouched and the result
# is an ndarray even when no class needs padding (the cells that consume it
# index it with `[:, k]`, which a DataFrame does not support).
data_fill_cpn27 = np.array(data_matrix_cpn27, dtype=object)

# Collect all padding rows first and stack once, instead of re-copying the
# whole matrix for every added row.
padding_rows = []
for key, value in reduce_cod.items():
    # Object dtype keeps the zeros numeric and the label in its original type
    # (concatenating a float array with a string array would turn both into strings).
    nrow = np.empty(data_fill_cpn27.shape[1], dtype=object)
    nrow[:vector_length] = 0.0
    nrow[vector_length:] = ''
    nrow[vector_length+2] = key
    # `value` < n here, so n - value is the positive number of rows still missing
    padding_rows.extend([nrow] * int(n - value))

if padding_rows:
    data_fill_cpn27 = np.vstack([data_fill_cpn27] + padding_rows)

In [None]:
# Target column of the padded CPN27 matrix
y = data_fill_cpn27[:, vector_length+2]

# The sorted distinct labels define the integer encoding (position -> label and back)
labels = np.unique(y)
idx2class = dict(enumerate(labels))
class2idx = {label: position for position, label in idx2class.items()}

# Replace every label by its integer code
y_label = np.vectorize(class2idx.get)(y)

In [None]:
from xgboost import XGBClassifier
from functions.Experimentations import ParamSearch

# Grid: tree depth crossed with 1..15 estimators
max_levels = [5,10,50,100]
parameters = {'max_depth': max_levels, 'n_estimators': np.arange(1, 16)}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)
estimator_name = type(bst).__name__
file_name_cpn27 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name
)

# Train on the padded CPN27 embeddings against the integer-encoded labels
clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(
    data_fill_cpn27[:, :vector_length],
    y_label,
    file_name_cpn27,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from functions.Experimentations import ParamSearch

# Grid over the maximum tree depth
max_levels = [5,10,50,100]
parameters = {'max_depth': max_levels}

rndforest = RandomForestClassifier(random_state=0)
estimator_name = type(rndforest).__name__
file_name_cpn27 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn27, type_standardization, estimator_name
)

# Train on the padded CPN27 embeddings against the integer-encoded labels
clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(
    data_fill_cpn27[:, :vector_length],
    y_label,
    file_name_cpn27,
)

## CPN120


In [None]:
# Pad under-represented classes of CPN120 so that every class has at least `n` rows.
# Each padding row has an all-zero embedding, blank values in the remaining
# columns, and the class label in column `vector_length + 2`.

# Minimum number of rows required per class
n=5

# Number of rows per class label, restricted to the classes below the minimum
cod = pd.DataFrame(data_matrix_cpn120).iloc[:,vector_length+2].value_counts()
reduce_cod = cod[cod<n]

# Work on an ndarray copy so `data_matrix_cpn120` stays untouched and the result
# is an ndarray even when no class needs padding (the cells that consume it
# index it with `[:, k]`, which a DataFrame does not support).
data_fill_CPN120 = np.array(data_matrix_cpn120, dtype=object)

# Collect all padding rows first and stack once, instead of re-copying the
# whole matrix for every added row.
padding_rows = []
for key, value in reduce_cod.items():
    # Object dtype keeps the zeros numeric and the label in its original type
    # (concatenating a float array with a string array would turn both into strings).
    nrow = np.empty(data_fill_CPN120.shape[1], dtype=object)
    nrow[:vector_length] = 0.0
    nrow[vector_length:] = ''
    nrow[vector_length+2] = key
    # `value` < n here, so n - value is the positive number of rows still missing
    padding_rows.extend([nrow] * int(n - value))

if padding_rows:
    data_fill_CPN120 = np.vstack([data_fill_CPN120] + padding_rows)

In [None]:
# Target column of the padded CPN120 matrix
y = data_fill_CPN120[:, vector_length+2]

# The sorted distinct labels define the integer encoding (position -> label and back)
labels = np.unique(y)
idx2class = dict(enumerate(labels))
class2idx = {label: position for position, label in idx2class.items()}

# Replace every label by its integer code
y_label = np.vectorize(class2idx.get)(y)

In [None]:
from xgboost import XGBClassifier
from functions.Experimentations import ParamSearch

# Grid: tree depth crossed with 1..15 estimators
max_levels = [5,10,50]
parameters = {'max_depth': max_levels, 'n_estimators': np.arange(1, 16)}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)
estimator_name = type(bst).__name__
file_name_cpn120 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name
)

# Train on the padded CPN120 embeddings against the integer-encoded labels
clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(
    data_fill_CPN120[:, :vector_length],
    y_label,
    file_name_cpn120,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from functions.Experimentations import ParamSearch

# Grid over the maximum tree depth
max_levels = [5,10,50]
parameters = {'max_depth': max_levels}

rndforest = RandomForestClassifier(random_state=0)
estimator_name = type(rndforest).__name__
file_name_cpn120 = "results/"+r'{}_{}_{}_results.csv'.format(
    dataset_name_cpn120, type_standardization, estimator_name
)

# Train on the padded CPN120 embeddings against the integer-encoded labels
clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(
    data_fill_CPN120[:, :vector_length],
    y_label,
    file_name_cpn120,
)