In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
import random
import time

from sentence_transformers import SentenceTransformer

# Seed the random number generators so the stochastic models (e.g. the k-means
# step) give reproducible results.
# scikit-learn estimators created without `random_state` draw from NumPy's global
# generator, not from the stdlib `random` module, so both are seeded here.
random.seed(0)
np.random.seed(0)

# Global Variables 

In [None]:
# Number of components in each embedding vector. It sizes `descriptions_matrix` and
# marks where the embedding columns of `data_matrix` end and the original dataset
# columns begin.
# NOTE(review): presumably has to match the output size of the SentenceTransformer
# model loaded in the configuration section — confirm when changing the model.
vector_length = 768

# Data Import

In [8]:
# Data import
# `file_path` is the folder (relative to this notebook) holding the normalized CSV
# files; `dataset_name` selects which one is loaded. Change both for your dataset.
dataset_name = "cpn27"
file_path = '../data/raw_dataset/normalize/'

if dataset_name == "cpn27":
    # Alternative file previously used here: 'CPN27_lemma.csv'
    data = pd.read_csv(file_path+'CPN27_normalize.csv', delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(file_path+'CPN120_normalize.csv', delimiter=",")
else:
    # Fail early with a clear message instead of a NameError on `data` below.
    # To load a custom file, add a branch such as:
    #     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")
    raise ValueError(
        "Unknown dataset_name {!r}; expected 'cpn27' or 'cpn120'".format(dataset_name)
    )

# `fillna` returns a new frame; assign it back so the missing values are really
# replaced by empty strings in `data` and not only in the displayed output.
data = data.fillna(value='')
data

Unnamed: 0,Palabra (concepto),Descripción,Codificación
0,Compasión,sentimiento,sentimiento
1,Compasión,por lo que se le perdona la vida a alguien en ...,perdón
2,plan,necesario para cumplir objetivos,objetivos
3,plan,organizar recursos o personas,organización
4,plan,estrategia,estrategia
...,...,...,...
4933,Obligación,imposicion,imposición
4934,Obligación,carácter extricto,extricto
4935,Obligación,normas,normas
4936,Obligación,deberes,deber


# Environment Configuration

In [9]:
# Sentence-embedding model; the "Word Embedding" section calls `model.encode` on
# every description.
model = SentenceTransformer('dccuchile/bert-base-spanish-wwm-cased')

# NOTE(review): `cfg` is not defined in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel unless `cfg` is created elsewhere — confirm
# where the configuration object comes from. `nlp` is not used by the visible cells.
nlp = spacy.load(cfg.enviroment.nlp)

# Word Embedding

In [10]:
# Timer
start = time.time()

# Text to embed. Change the column index to the one holding the descriptions of
# your dataset. Missing values become empty strings and every entry is coerced to
# `str`, because the encoder only accepts text.
descriptions = data.iloc[:, 1].fillna('').astype(str).tolist()

# Encode all descriptions in one batched call (one row per description) instead of
# calling the model once per row.
if descriptions:
    descriptions_matrix = np.asarray(model.encode(descriptions), dtype=np.float64)
else:
    descriptions_matrix = np.zeros((0, vector_length))

# Every later cell slices `data_matrix` with `vector_length`, so a mismatch between
# the configured size and the model output must not go unnoticed.
if descriptions_matrix.shape != (len(data), vector_length):
    raise ValueError(
        "Embedding matrix has shape {} but ({}, {}) was expected; "
        "check `vector_length` against the model output size".format(
            descriptions_matrix.shape, len(data), vector_length
        )
    )

# Concatenate the embeddings with the original columns of each observation:
# columns [0, vector_length) are the embedding, the remaining ones are the dataset.
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Drop the rows whose embedding contains missing values.
data_matrix = data_matrix[~pd.isnull(data_matrix[:, :vector_length]).any(axis=1)]

end = time.time()
print("Execution time:", end-start)

Execution time: 0.4028456211090088


# AC-PLT

In [None]:
from Experimentations import ParamSearch
from AC_PLT import AC_PLT

# Grid of cluster counts to evaluate: 50, 100, ..., 1750.
parameters = {'n_clusters': np.arange(50, 1800, 50)}
ac_plt = AC_PLT()

# Results file, named after the dataset and the estimator class.
file_name = r'../data/results/Word2Vec/lemmatize/{}_{}_results.csv'.format(dataset_name, ac_plt.__class__.__name__)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], file_name)

# Naïve Bayes

In [11]:
from Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# Smoothing values to evaluate. A wider logarithmic grid was used before:
# parameters = {'var_smoothing': np.geomspace(1e-06, 1e+02, num=9)}
parameters = {'var_smoothing': (20, 30, 40, 50, 60, 70, 80, 90)}
proc = GaussianNB()

# Results file, named after the dataset and the estimator class.
file_name = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, proc.__class__.__name__)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf = ParamSearch(proc, parameters)
clf.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], file_name)



<Experimentations.ParamSearch at 0x2430bb2ded0>

In [11]:
# Display the summary kept by the Naïve Bayes search: the evaluated parameter
# settings with the train/test mean and standard deviation of each one.
clf.cv_results_

{'params': [{'var_smoothing': 1e-06},
  {'var_smoothing': 1e-05},
  {'var_smoothing': 0.0001},
  {'var_smoothing': 0.001},
  {'var_smoothing': 0.01},
  {'var_smoothing': 0.1},
  {'var_smoothing': 1.0},
  {'var_smoothing': 10.0},
  {'var_smoothing': 100.0}],
 'train_mean': array([0.7487839 , 0.74881529, 0.74887805, 0.74922327, 0.75047861,
        0.74256215, 0.62481169, 0.60896311, 0.11165424]),
 'train_std': array([0.00147615, 0.00147441, 0.0014603 , 0.00143501, 0.00152015,
        0.00134227, 0.00228291, 0.00141481, 0.0018167 ]),
 'test_mean': array([0.606955  , 0.60698638, 0.60701776, 0.60745711, 0.61100331,
        0.61878623, 0.54017083, 0.53766016, 0.10968489]),
 'test_std': array([0.0112798 , 0.01127949, 0.01107009, 0.01066549, 0.00886624,
        0.0093967 , 0.00901088, 0.00739244, 0.00414743])}

In [12]:
# Export the Naïve Bayes search (presumably its results table — verify in
# `Experimentations.ParamSearch`) to a CSV named after the dataset and estimator class.
# NOTE(review): the same path is already passed to `clf.fit` as `file_name` in the
# search cell — check whether this second export is still needed.
clf.to_csv(r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, proc.__class__.__name__))

<Experimentations.ParamSearch at 0x2cbebb4d360>

# Support Vector Classifier

In [8]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF kernel, lower range of the regularisation grid: C = 1e-05, 1e-04, ..., 1e+01.
kernel='rbf'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
# Results file, named after the dataset, the estimator class and the kernel.
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)



<Experimentations.ParamSearch at 0x2443c1217e0>

In [9]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF kernel, upper range of the regularisation grid: C = 1e+01, 1e+02, 1e+03, 1e+04.
# NOTE(review): this writes to the same results file as the lower-range RBF search —
# confirm whether ParamSearch appends to or overwrites an existing file.
kernel='rbf'
parameters = {'C': np.geomspace(1e01, 1e+04, num=4)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)



<Experimentations.ParamSearch at 0x24446d72a40>

In [14]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Linear kernel, regularisation grid: C = 1e-05, 1e-04, ..., 1e+01.
kernel='linear'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2])

# Unlike the other SVC cells, no file name is given to `fit`; the search is exported
# explicitly once it has finished.
clf_acplt.to_csv(r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel))



<Experimentations.ParamSearch at 0x2cbebb4fa30>

In [10]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Polynomial kernel; the grid explores the polynomial degree from 1 to 6.
kernel='poly'
parameters = {'degree': np.arange(1,7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
# Results file, named after the dataset, the estimator class and the kernel.
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)





<Experimentations.ParamSearch at 0x24446d723b0>

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Sigmoid kernel, regularisation grid: C = 1e-05, 1e-04, ..., 1e+01.
kernel='sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
# Results file, named after the dataset, the estimator class and the kernel.
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)

# Decision Trees

In [13]:
from Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

# Gini-based decision tree; the grid explores the leaf budget: 100, 200, ..., 1500.
criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(100, 1501, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
# Results file, named after the dataset, the estimator class and the split criterion.
fileName = r'../data/results/Word2Vec/{}_{}_{}_results.csv'.format(dataset_name, desition_tree.__class__.__name__, criterion)

# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`. The notebook-level `vector_length` is used because no
# `cfg` object is defined in this notebook.
clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)



<Experimentations.ParamSearch at 0x20abd22bd00>

# XGBoost

In [7]:
# Pad rare classes: every label that occurs fewer than `n` times receives extra
# all-zero observations until it occurs exactly `n` times.
# NOTE(review): presumably needed so each class has enough samples for the
# cross-validation splits used by the search — confirm the required minimum.
n = 5

# Number of occurrences of every label (the label is column `vector_length + 2`).
cod = pd.DataFrame(data_matrix).iloc[:, vector_length+2].value_counts()
reduce_cod = cod[cod < n]

padding_rows = []
for key, value in reduce_cod.items():
    # Build the row with object dtype: concatenating a float array with a string
    # array would silently turn the zero features into the string '0.0'.
    nrow = np.empty(data_matrix.shape[1], dtype=object)
    nrow[:vector_length] = 0.0        # neutral embedding
    nrow[vector_length:] = ''         # empty dataset columns ...
    nrow[vector_length+2] = key       # ... except the label
    padding_rows.extend([nrow] * int(n - value))

# Stack once at the end instead of growing the matrix row by row. The result is
# always a new NumPy array, even when no class needs padding, so the following
# cells can slice it with `data_fill[:, ...]`.
data_fill = np.vstack([data_matrix] + padding_rows)

In [8]:
# Label column of the padded matrix. `np.asarray` keeps this working when
# `data_fill` is still a DataFrame (no class needed padding).
y = np.asarray(data_fill)[:, vector_length+2]

# `labels` holds the sorted unique label values and `y_label` gives, for every
# observation, the position of its label in `labels` (the integer class id).
labels, y_label = np.unique(y, return_inverse=True)

# Lookup tables between integer class ids and the original label values.
idx2class = dict(enumerate(labels))
class2idx = {label: idx for idx, label in idx2class.items()}

In [9]:
from xgboost import XGBClassifier
from Experimentations import ParamSearch

# Grid: maximum tree depth crossed with the number of boosting rounds (1 to 19).
max_levels = [5,10,50,100,200,300]

parameters = {'max_depth': max_levels, 'n_estimators': np.arange(1, 20, 1)}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)
# Results file, named after the dataset and the estimator class.
fileName = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, bst.__class__.__name__)

# Trains on the padded matrix `data_fill` with the integer-encoded labels `y_label`.
# Features are its first `vector_length` columns; the notebook-level `vector_length`
# is used because no `cfg` object is defined in this notebook. `np.asarray` keeps the
# slicing valid when `data_fill` is a DataFrame.
clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(np.asarray(data_fill)[:, :vector_length], y_label, fileName)


# Random Forest
Run the XGBoost cells above first: this section reuses the `data_fill` and `y_label` variables they create.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from Experimentations import ParamSearch

# Grid over the maximum tree depth. The key must be spelled exactly like the
# estimator parameter (`max_depth`); a misspelled key is not a valid parameter of
# RandomForestClassifier.
max_levels = [5,10,50,100,200,300]

parameters = {'max_depth': max_levels}
rndforest = RandomForestClassifier(random_state=0)
# Results file, named after the dataset and the estimator class.
fileName = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, rndforest.__class__.__name__)

# Trains on the padded matrix `data_fill` and the integer labels `y_label` created
# in the XGBoost section. Features are its first `vector_length` columns; the
# notebook-level `vector_length` is used because no `cfg` object is defined in this
# notebook. `np.asarray` keeps the slicing valid when `data_fill` is a DataFrame.
clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(np.asarray(data_fill)[:, :vector_length], y_label, fileName)


# KNN

In [None]:
from Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

# Grid over the number of neighbours: 10, 20, ..., 500.
parameters = {'n_neighbors': np.arange(10, 501, 10)}
knn = KNeighborsClassifier()
# Results file, named after the dataset and the estimator class.
fileName = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, knn.__class__.__name__)

# Train on the embedding matrix like the other searches. The raw `data` frame only
# holds the original dataset columns, so slicing it with `vector_length` would not
# return the embeddings and column `vector_length + 2` does not exist in it.
# Features are the first `vector_length` columns of `data_matrix`; the target is
# column `vector_length + 2`.
clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)