In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
import random
import time
import sys
import os

sys.path.append(os.path.join("..", ".."))

# Created functions
from functions.Vectorize import to_vector

# Seed the random number generators so the k-means based runs are reproducible.
# scikit-learn estimators created without an explicit `random_state` draw from
# NumPy's global generator, so seeding only the stdlib `random` module does not
# make them deterministic.
# NOTE(review): assumes the k-means model used by AC_PLT follows the
# scikit-learn convention — confirm.
random.seed(0)
np.random.seed(0)

# Global Variables 

In [2]:
class _AttrDict(dict):
    """Dictionary whose keys can also be read as attributes.

    The other cells read settings as ``cfg.params.kfold``; a plain ``dict``
    only supports ``cfg["params"]["kfold"]``. Both spellings work here.
    Keys that collide with ``dict`` method names (``items``, ``keys``, ...)
    are only reachable through item access.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails, so the regular
        # dict methods keep working.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"configuration has no entry {name!r}") from None


def _as_attr_dict(value):
    """Recursively convert nested dictionaries into ``_AttrDict`` instances."""
    if isinstance(value, dict):
        return _AttrDict({key: _as_attr_dict(item) for key, item in value.items()})
    return value


# Experiment configuration.
# NOTE(review): other cells also read `cfg.path_embedding.word2vec`,
# `cfg.enviroment.nlp` and `cfg.path_results.Word2Vec`, which are not defined
# here — add those sections before running them.
cfg = _as_attr_dict({
    "params": {
        "kfold": 5,
        "kmeans": {
            # Number of clusters per dataset
            "n_cluster": {
                "cpn27": 550,
                "cpn120": 1750
            }
        },
        # Number of components of each embedding type
        "vector_length": {
            "word2vec": 300,
            "sentence_embedding": 768
        },
        "random_state": 0
    }
})

# Data Import

In [3]:
# Data import

dataset_name = "cpn27"
# Options are "cpn27" and "cpn120"

type_standardization = "normalize_wo_stop"
# Options are "raw", "normalize", "normalize_wo_stop", "lemmatize", and "lemmatize_wo_stop"

# Relative path of the dataset CSV; change it for your dataset.
# NOTE(review): the original call had no path argument
# (`pd.read_csv(, delimiter=",")`), which is a SyntaxError. The real location
# cannot be recovered from this notebook, so it has to be filled in here.
data_path = None

if data_path is None:
    raise ValueError("Set `data_path` to the relative path of the dataset CSV.")

# Import of the data; missing cells are replaced by empty strings
data = pd.read_csv(data_path, delimiter=",")
data = data.fillna(value='')

# Environment Configuration

In [4]:
# Load the Spanish Billion Words embeddings as word2vec keyed vectors.
word2vec_path = cfg.path_embedding.word2vec  # relative path of the vector file
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,  # the vector file is stored in binary format
)

# Load the spaCy pipeline named in the configuration.
nlp = spacy.load(cfg.enviroment.nlp)

# Word Embedding

In [5]:
# Start timing the embedding step
start = time.time()

n_components = cfg.params.vector_length.word2vec  # components per word embedding
n_rows = data.shape[0]                            # one row per data point

# Zero-initialised matrix that receives one vector per row of `data`
descriptions_matrix = np.zeros((n_rows, n_components))

# Fill each row with the vector built from the second column of `data`
for row_idx, text in enumerate(data.iloc[:, 1]):
    descriptions_matrix[row_idx,] = to_vector(text, model, n_components)

# Place the vector columns in front of the original columns of `data`
data_matrix = np.concatenate((descriptions_matrix, data), axis=1)

# Keep only the rows whose vector columns contain no null value
vector_is_null = pd.isnull(data_matrix[:, :n_components])
data_matrix = data_matrix[~vector_is_null.any(axis=1)]


end = time.time()
print("Execution time:", end-start)

Execution time: 0.25395774841308594


# AC-PLT

In [6]:
from functions.Experimentations import ParamSearch
from functions.AC_PLT import AC_PLT

# AC-PLT search over the number of clusters: 50, 100, ..., 1750
ac_plt = AC_PLT()
parameters = {'n_clusters': np.arange(50, 1800, 50)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_results.csv
name_parts = (dataset_name, type_standardization, ac_plt.__class__.__name__)
file_name = cfg.path_results.Word2Vec + r'{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(features, target, file_name)



<functions.Experimentations.ParamSearch at 0x1c87f8461d0>

# Naïve Bayes

In [6]:
from functions.Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes search over `var_smoothing`.
# (An earlier grid, np.geomspace(1e-06, 1e+02, num=9), was replaced by the
# values below.)
proc = GaussianNB()
parameters = {'var_smoothing': (5, 10, 20, 30, 40, 50, 60, 70, 80, 90)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_results.csv
name_parts = (dataset_name, type_standardization, proc.__class__.__name__)
file_name = cfg.path_results.Word2Vec + r'{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf = ParamSearch(proc, parameters)
clf.fit(features, target, file_name)



<functions.Experimentations.ParamSearch at 0x1bf62ef8d90>

# Support Vector Classifier

In [7]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# SVC with an RBF kernel: six log-spaced values of C from 1e-03 to 1e+01
kernel = 'rbf'
svc_rbf = SVC(kernel=kernel, gamma='auto')
parameters = {'C': np.geomspace(1e-03, 1e+01, num=6)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<kernel>_results.csv
name_parts = (dataset_name, type_standardization, svc_rbf.__class__.__name__, kernel)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(features, target, fileName)



<functions.Experimentations.ParamSearch at 0x1bfb48d9b40>

In [8]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# SVC with an RBF kernel: four log-spaced values of C from 1e02 to 1e+04
kernel = 'rbf'
svc_rbf = SVC(kernel=kernel, gamma='auto')
parameters = {'C': np.geomspace(1e02, 1e+04, num=4)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<kernel>_results.csv
# NOTE(review): this is the same file name the other RBF cell builds — confirm
# whether ParamSearch appends to or overwrites an existing file.
name_parts = (dataset_name, type_standardization, svc_rbf.__class__.__name__, kernel)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(features, target, fileName)



<functions.Experimentations.ParamSearch at 0x1bfb99918d0>

In [6]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# SVC with a linear kernel, searched over C = 1..6.
# (An earlier grid, np.geomspace(1e-05, 1e+01, num=7), was replaced by the
# values below.)
kernel = 'linear'
svc_linear = SVC(kernel=kernel, gamma='auto')
parameters = {'C': [1, 2, 3, 4, 5, 6]}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<kernel>_results.csv
name_parts = (dataset_name, type_standardization, svc_linear.__class__.__name__, kernel)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(svc_linear, parameters)
clf_acplt.fit(features, target, fileName)

# clf_acplt.to_csv()



<functions.Experimentations.ParamSearch at 0x17dffbf2c50>

In [7]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# SVC with a polynomial kernel, searched over degree = 1..6
kernel = 'poly'
svc_poly = SVC(kernel=kernel, gamma='auto')
parameters = {'degree': np.arange(1,7)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<kernel>_results.csv
name_parts = (dataset_name, type_standardization, svc_poly.__class__.__name__, kernel)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(svc_poly, parameters)
clf_acplt.fit(features, target, fileName)



<functions.Experimentations.ParamSearch at 0x17dffc571f0>

In [8]:
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# SVC with a sigmoid kernel: seven log-spaced values of C from 1e-05 to 1e+01
kernel = 'sigmoid'
svc_sigmoid = SVC(kernel=kernel, gamma='auto')
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<kernel>_results.csv
name_parts = (dataset_name, type_standardization, svc_sigmoid.__class__.__name__, kernel)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(svc_sigmoid, parameters)
clf_acplt.fit(features, target, fileName)



<functions.Experimentations.ParamSearch at 0x17dc61158a0>

# Decision Trees

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the Gini criterion, searched over
# max_leaf_nodes = 100, 200, ..., 1000
criterion = 'gini'
decision_tree = DecisionTreeClassifier(criterion=criterion)
parameters = {'max_leaf_nodes': np.arange(100, 1001, 100)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_<criterion>_results.csv
name_parts = (dataset_name, type_standardization, decision_tree.__class__.__name__, criterion)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(decision_tree, parameters)
clf_acplt.fit(features, target, fileName)

# clf_acplt.to_csv()



# XGBoost

In [7]:
# Pad under-represented labels so that every label has at least `n` rows.
# NOTE(review): presumably required so each label can appear in every fold of
# the cross-validation run by ParamSearch (cfg.params.kfold is 5) — confirm.
n = 5  # minimum number of rows per label

n_features = cfg.params.vector_length.word2vec  # number of embedding columns
label_col = n_features + 2                      # the label sits two columns after them

# Always work on an object ndarray (not a DataFrame) so that positional
# slicing such as `data_fill[:, k]` works whether or not padding is needed,
# and so numeric features and text columns can share a row.
data_fill = np.array(data_matrix, dtype=object)

# Number of rows per label, and the labels that fall short of `n`
cod = pd.Series(data_fill[:, label_col]).value_counts()
reduce_cod = cod[cod < n]

# Build all padding rows first and stack once, instead of re-allocating the
# whole matrix for every added row.
padding_rows = []
for key, value in reduce_cod.items():
    row = np.empty(data_fill.shape[1], dtype=object)
    row[:n_features] = 0.0  # all-zero embedding, kept numeric
    row[n_features:] = ''   # empty non-feature columns
    row[label_col] = key    # label keeps its original type
    padding_rows.extend([row] * int(n - value))

if padding_rows:
    data_fill = np.vstack([data_fill] + padding_rows)

In [8]:
# Target column: the label sits two positions after the embedding columns.
# `np.asarray` makes positional slicing work even if `data_fill` is still a
# DataFrame.
y = np.asarray(data_fill)[:, cfg.params.vector_length.word2vec + 2]

# `labels` holds the sorted unique labels; `y_label` holds, for every row,
# the position of its label in `labels` (the integer encoding of the target).
labels, y_label = np.unique(y, return_inverse=True)

# Lookup tables between the integer encoding and the original labels
idx2class = dict(enumerate(labels))
class2idx = {label: idx for idx, label in idx2class.items()}

In [9]:
from xgboost import XGBClassifier
from functions.Experimentations import ParamSearch

# XGBoost search over tree depth and number of estimators (1..15)
max_levels = [5, 10, 50, 100]
parameters = {
    'max_depth': max_levels,
    'n_estimators': np.arange(1, 16, 1),
}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)

# Results file: <results dir><dataset>_<standardization>_<estimator class>_results.csv
name_parts = (dataset_name, type_standardization, bst.__class__.__name__)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_results.csv'.format(*name_parts)

# Features are the first `n_components` columns of the padded matrix; the
# target is the integer-encoded label vector.
n_components = cfg.params.vector_length.word2vec
features = data_fill[:, :n_components]

clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(features, y_label, fileName)


# Random Forest
Run the XGBoost cells first: this section reuses the `data_fill` and `y_label` variables they create.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from functions.Experimentations import ParamSearch

# Random forest search over the maximum tree depth
max_levels = [5, 10, 50, 100]
parameters = {'max_depth': max_levels}

rndforest = RandomForestClassifier(random_state=0)

# Results file: <results dir><dataset>_<standardization>_<estimator class>_results.csv
name_parts = (dataset_name, type_standardization, rndforest.__class__.__name__)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_results.csv'.format(*name_parts)

# Features are the first `n_components` columns of the padded matrix; the
# target is the integer-encoded label vector.
n_components = cfg.params.vector_length.word2vec
features = data_fill[:, :n_components]

clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(features, y_label, fileName)


# KNN

In [None]:
from functions.Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours search over n_neighbors = 10, 20, ..., 500
knn = KNeighborsClassifier()
parameters = {'n_neighbors': np.arange(10, 501, 10)}

# Results file: <results dir><dataset>_<standardization>_<estimator class>_results.csv
name_parts = (dataset_name, type_standardization, knn.__class__.__name__)
fileName = cfg.path_results.Word2Vec + r'{}_{}_{}_results.csv'.format(*name_parts)

# The first `n_components` columns are the features; the target is the
# column two positions after them.
n_components = cfg.params.vector_length.word2vec
features = data_matrix[:, :n_components]
target = data_matrix[:, n_components + 2]

clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(features, target, fileName)