In [1]:
import pandas as pd
import numpy as np
import random
import time
import sys
import os

sys.path.append(os.path.join("..", ".."))

# Word Embedding
from sentence_transformers import SentenceTransformer

# Created functions
from functions.Experimentations import ParamSearch
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


# Variables 

In [2]:
from hydra import initialize, compose

with initialize(version_base=None, config_path="../../conf"):
    cfg=compose(config_name="main.yaml")

# Data Import

In [3]:
# Data import
# Relative Path of the dataset, change for your dataset

dataset_name = "cpn120"
# Options are "cpn27" and "cpn120"

type_standardization = "normalize" 
# options are "raw", "normalize", "normalize_wo_stop", and "lemmatize"

# import of the data
data = pd.read_csv(cfg.path[type_standardization][dataset_name], delimiter=",")
data = data.fillna(value='')

# Model

In [4]:
model = SentenceTransformer('intfloat/e5-base-v2')
# model = SentenceTransformer('intfloat/multilingual-e5-base')

In [5]:
# Timer
start = time.time()

descriptions_matrix = np.zeros( # creatign an empty matrix
    (
        len(data),                                       # the number of data points
        cfg.params.vector_length.sentence_embedding      # the number of components of the word embedding
    )
)

# Matrix filling 
# Change to the name of the descriptions of your dataset.
for i,description in enumerate(data.iloc[:,1]):
    vector = model.encode(description)
    descriptions_matrix[i,] = vector

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix,data], axis=1)


# Remove of the 'Nan' data
data_matrix = data_matrix[~pd.isnull(data_matrix[:,:cfg.params.vector_length.sentence_embedding]).any(axis=1)]


end = time.time()
print("Exceution time:", end-start)

Exceution time: 931.6799569129944


# AC-PLT

In [6]:
from functions.AC_PLT import AC_PLT

parameters = {'n_clusters': np.arange(50, 1800, 50)}
ac_plt = AC_PLT()

file_name = cfg.path_results.E5+r'{}_{}_{}_results.csv'.format(dataset_name, type_standardization, ac_plt.__class__.__name__)

clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2],file_name)



<functions.Experimentations.ParamSearch at 0x26c1295b2b0>

# Naive Bayes

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# parameters = {'var_smoothing': np.arange(1, 10, 0.5)}
parameters = {'var_smoothing': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6)}
proc = GaussianNB()

file_name = r'../data/results/E5/{}_{}_results.csv'.format(dataset_name, proc.__class__.__name__)

clf = ParamSearch(proc, parameters)
clf.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2],file_name)

# SVC

In [None]:
from sklearn.svm import SVC

kernel='rbf'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2], fileName)

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='linear'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_linear = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_linear.__class__.__name__, kernel)


clf_acplt = ParamSearch(svc_linear, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2])

In [6]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='linear'
parameters = {'C': np.arange(1,11, 0.5)}
svc_linear = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_linear.__class__.__name__, kernel)


clf_acplt = ParamSearch(svc_linear, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2], fileName)



In [None]:
from sklearn.svm import SVC

kernel='poly'
parameters = {'degree': np.arange(1,7)}
svc_poly = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_poly.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_poly, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2], fileName)

In [None]:
from sklearn.svm import SVC

kernel='sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_sigmoid = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_sigmoid, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2], fileName)

# Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(500, 601, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
fileName = r'../../../data/results/E5/{}_{}_{}_results.csv'.format(dataset_name, desition_tree.__class__.__name__, criterion)

clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2],fileName)



<functions.Experimentations.ParamSearch at 0x190b9d73040>

# XGBoost

In [7]:
cod = pd.DataFrame(data_matrix).iloc[:,cfg.params.vector_length.sentence_embedding+2].value_counts()
reduce_cod = cod[cod<5]
n=5
data_fill = pd.DataFrame(data_matrix).copy()

for key, value in reduce_cod.items():
    m=np.abs(n-value)
    nrow = np.zeros(cfg.params.vector_length.sentence_embedding)
    nrow = np.concatenate([nrow, np.array(['', '', key])])
    for i in range(m): data_fill = np.vstack([data_fill,nrow])

In [8]:
y = data_fill[:,cfg.params.vector_length.sentence_embedding+2]

labels = np.unique(y)
i=0
idx2class = {}
class2idx= {}
for tp in labels:
    idx2class[i] = tp
    class2idx[tp] = i
    i += 1

y_label = np.vectorize(class2idx.get)(y)

In [9]:
from xgboost import XGBClassifier
from functions.Experimentations import ParamSearch

max_levels = [5,10,50,100,200,300]

parameters = {'max_depth': max_levels}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0, multi_strategy="multi_output_tree")
fileName = r'../data/results/E5/{}_{}_results.csv'.format(dataset_name, bst.__class__.__name__)

clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(data_fill[:, :cfg.params.vector_length.sentence_embedding], y_label, fileName)


Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not used.

Parameters: { "multi_strategy" } are not

# Random Forest
Ejecutar las celdas de Xgboost

In [8]:
from sklearn.ensemble import RandomForestClassifier
from functions.Experimentations import ParamSearch

max_levels = [300]

parameters = {'max_depth': max_levels}
rndforest = RandomForestClassifier(random_state=0)
fileName = r'../data/results/E5/{}_{}_results.csv'.format(dataset_name, rndforest.__class__.__name__)

clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(data_fill[:, :cfg.params.vector_length.sentence_embedding], y_label, fileName)


<Experimentations.ParamSearch at 0x1a4561ff3a0>

# KNN

In [10]:
from functions.Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

parameters = {'n_neighbors': np.arange(1, 10)}
knn = KNeighborsClassifier()
fileName = r'../data/results/E5/{}_{}_results.csv'.format(dataset_name, knn.__class__.__name__)

clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(data_matrix[:, :cfg.params.vector_length.sentence_embedding], data_matrix[:, cfg.params.vector_length.sentence_embedding+2], fileName)



<Experimentations.ParamSearch at 0x23285fb2c50>