In [1]:
# Data handling, numerics and standard-library utilities used across the notebook.
import pandas as pd
import numpy as np
import random
import time
import sys
import os

# Append the directory two levels above the current working directory to the
# import path. This has to run before the `functions` import below
# (presumably that directory contains the `functions` package -- TODO confirm).
sys.path.append(os.path.join("..", ".."))

# Word Embedding
from sentence_transformers import SentenceTransformer

# Project helper used by every experiment cell for the parameter search.
from functions.Experimentations import ParamSearch
# Seeds only Python's built-in `random` module; NumPy's global RNG is not seeded here.
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


# Variables 

In [2]:
from hydra import compose, initialize

# Directory holding the Hydra configuration files (given relative to this notebook).
config_dir = "../../conf"

# Initialise Hydra for the duration of the `with` block and compose the main
# configuration; `cfg` is read by the cells below (e.g. `cfg.params...`).
with initialize(config_path=config_dir, version_base=None):
    cfg = compose(config_name="main.yaml")

# Data Import

In [3]:
# Dataset selection
# Options are "cpn27" and "cpn120" and "Democracia" and "Naturaleza"
dataset_name = "Naturaleza"

# Text standardization variant of the dataset to load.
# Options are "raw", "normalize", "normalize_wo_stop", and "lemmatize"
type_standardization = "lemmatize"

# Read the CSV (relative path, change it for your dataset) and replace missing
# values with empty strings in a single chained expression.
data = pd.read_csv(
    f'../../../data/raw_dataset/{type_standardization}/{dataset_name}_{type_standardization}.csv',
    delimiter=",",
).fillna(value='')

# Model

In [4]:
# Alternative checkpoint, kept for reference:
# model = SentenceTransformer('dccuchile/bert-base-spanish-wwm-cased')

# Checkpoint used to embed the descriptions.
model_name = 'google-bert/bert-base-uncased'
model = SentenceTransformer(model_name)

No sentence-transformers model found with name C:\Users\dra98/.cache\torch\sentence_transformers\google-bert_bert-base-uncased. Creating a new one with MEAN pooling.


In [5]:
# Timer
start = time.time()

# Number of components of each sentence embedding, as configured.
embedding_dim = cfg.params.vector_length.sentence_embedding

# Pre-allocated (rows = data points, columns = embedding components) so that a
# mismatch between the configured length and the model's real output size
# fails on assignment instead of going unnoticed.
descriptions_matrix = np.zeros((len(data), embedding_dim))

# Matrix filling
# Change the column index to the one holding the descriptions of your dataset.
descriptions = data.iloc[:, 1].tolist()
if descriptions:
    # Encode the whole column in one call so the model can batch internally,
    # instead of invoking it once per row from a Python loop.
    descriptions_matrix[:] = model.encode(descriptions)

# Concatenate the embeddings with the original columns of each observation:
# columns [0, embedding_dim) are the embedding, the rest are the dataset columns.
data_matrix = np.concatenate([descriptions_matrix, data], axis=1)

# Remove the rows whose embedding contains NaN values
has_nan_embedding = pd.isnull(data_matrix[:, :embedding_dim]).any(axis=1)
data_matrix = data_matrix[~has_nan_embedding]

end = time.time()
print("Exceution time:", end-start)

Exceution time: 396.0777778625488


# AC-PLT

In [None]:
from functions.AC_PLT import AC_PLT

# data_matrix layout: the first `n_components` columns are the sentence
# embedding; the original dataset columns follow, and the column at offset +2
# is the one used as target throughout this notebook.
n_components = cfg.params.vector_length.sentence_embedding
X_acplt = data_matrix[:, :n_components]
y_acplt = data_matrix[:, n_components + 2]

# Estimator and the grid of cluster counts to explore (50, 100, ..., 1450).
ac_plt = AC_PLT()
parameters = {'n_clusters': np.arange(50, 1500, 50)}

# Results file: configured results folder + dataset / standardization / model name.
file_name = cfg.path_results.BETO + r'{}_{}_{}_results.csv'.format(
    dataset_name,
    type_standardization,
    ac_plt.__class__.__name__,
)

# Run the parameter search (presumably results are written to `file_name` -- confirm in ParamSearch).
clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(X_acplt, y_acplt, file_name)

 'dangerous' 'size' 'life essential' 'conservation' 'seasonal change'
 'life essential' 'extreme' 'clean' 'breeze' 'life' 'pleasant' 'water'
 'seasonal change' 'ecological role' 'wind' 'breeze' 'habitat'
 'biodiversity' 'flight' 'clean' 'clean' 'clean' 'ecological role'
 'food source' 'ecological role' 'seasonal change' 'flight' 'conservation'
 'size' 'chlorophyll' 'climate influence' 'water' 'wind' 'natural' 'trees'
 'water' 'photosynthesis' 'climate influence' 'food source' 'food source'
 'wind' 'wind' 'photosynthesis' 'natural' 'photosynthesis' 'dangerous'
 'natural' 'seasonal change' 'chlorophyll' 'climate influence' 'feathers'
 'dangerous' 'edible' 'life' 'life essential' 'dangerous' 'life essential'
 'green' 'dangerous' 'green' 'dangerous' 'biodiversity' 'biodiversity'
 'clean' 'pleasant' 'natural' 'size' 'dangerous' 'dangerous'
 'life essential' 'natural' 'natural' 'beauty' 'pleasant' 'biodiversity'
 'ecological role' 'biodiversity' 'biodiversity' 'ecological role' 'life'
 'dang

# Naive Bayes

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# Grid of 9 log-spaced smoothing values between 1e-06 and 1e+02.
parameters = {'var_smoothing': np.geomspace(1e-06, 1e+02, num=9)}
proc = GaussianNB()

# Results file named after the dataset and the estimator class.
file_name = r'{}_{}_results.csv'.format(dataset_name, proc.__class__.__name__)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf = ParamSearch(proc, parameters)
clf.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], file_name)

# SVC

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF-kernel SVC; the search explores 7 log-spaced values of C in [1e-05, 1e+01].
kernel='rbf'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
# Results file named after the dataset, the estimator class and the kernel.
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Linear-kernel SVC; the search explores 7 log-spaced values of C in [1e-05, 1e+01].
kernel='linear'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_linear = SVC(kernel=kernel, gamma='auto')
# Results file named after the dataset, the estimator class and the kernel.
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_linear.__class__.__name__, kernel)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(svc_linear, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Polynomial-kernel SVC; the search explores polynomial degrees 1 through 6.
kernel='poly'
parameters = {'degree': np.arange(1,7)}
svc_poly = SVC(kernel=kernel, gamma='auto')
# Results file named after the dataset, the estimator class and the kernel.
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_poly.__class__.__name__, kernel)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(svc_poly, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.svm import SVC

# Sigmoid-kernel SVC; the search explores 7 log-spaced values of C in [1e-05, 1e+01].
kernel='sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_sigmoid = SVC(kernel=kernel, gamma='auto')
# Build the file name from this cell's own estimator, so the cell does not
# depend on the RBF cell having been executed first.
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_sigmoid.__class__.__name__, kernel)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(svc_sigmoid, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

# Decision Tree

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

# Gini-criterion decision tree; the search explores max_leaf_nodes = 100, 200, ..., 1500.
criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(100, 1501, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
# Results file named after the dataset, the estimator class and the split criterion.
fileName = r'{}_{}_{}_results.csv'.format(dataset_name, desition_tree.__class__.__name__, criterion)

# Features are the embedding columns of data_matrix; the target is the dataset
# column at offset +2 after the embedding.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

# XGBoost

In [None]:
# Pad under-represented classes so that every class has at least `n` rows.
#
# The class counts and the padded copy are taken from `data_matrix` (embedding
# columns followed by the dataset columns). The raw `data` frame has no
# embedding columns, so indexing it at `embedding_dim + 2` is out of range.
n = 5
embedding_dim = cfg.params.vector_length.sentence_embedding
label_col = embedding_dim + 2

# Number of rows per class, and the classes that have fewer than `n` rows.
cod = pd.Series(data_matrix[:, label_col]).value_counts()
reduce_cod = cod[cod < n]

# Collect every filler row first and stack once, instead of re-allocating the
# whole matrix with np.vstack on each appended row.
filler_rows = []
for key, value in reduce_cod.items():
    # Object dtype keeps the zero embedding numeric; concatenating a float
    # array with a string array would turn the zeros into strings.
    nrow = np.empty(data_matrix.shape[1], dtype=object)
    nrow[:embedding_dim] = 0.0   # all-zero embedding
    nrow[embedding_dim:] = ''    # empty values for the dataset columns
    nrow[label_col] = key        # class being padded
    filler_rows.extend([nrow] * int(n - value))

if filler_rows:
    data_fill = np.vstack([data_matrix, *filler_rows])
else:
    data_fill = data_matrix.copy()

In [None]:
# Target column of the padded matrix (dataset column at offset +2 after the embedding).
y = data_fill[:, cfg.params.vector_length.sentence_embedding + 2]

# Distinct class values, in the sorted order returned by np.unique.
labels = np.unique(y)

# Two-way lookup between consecutive integer ids (starting at 0) and class values.
idx2class = dict(enumerate(labels))
class2idx = {tp: i for i, tp in idx2class.items()}

# Integer-encoded target, one id per row of `y`.
y_label = np.vectorize(class2idx.get)(y)

In [None]:
from xgboost import XGBClassifier
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch

# Tree depths to explore.
max_levels = [5,10,50,100,200,300]

# The estimator's parameter is spelled `max_depth`; a misspelled key never
# reaches the tree-depth setting. n_estimators ranges over 1..19.
parameters = {'max_depth': max_levels, 'n_estimators': np.arange(1, 20, 1)}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)
# Results file named after the dataset and the estimator class.
fileName = r'{}_{}_results.csv'.format(dataset_name, bst.__class__.__name__)

# Uses the padded matrix and the integer-encoded labels built in the previous cells.
clf_acplt = ParamSearch(bst, parameters)
clf_acplt.fit(data_fill[:, :cfg.params.vector_length.sentence_embedding], y_label, fileName)


# Random Forest
Run the XGBoost cells above first: this section reuses the `data_fill` and `y_label` variables they create.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch

# Tree depths to explore.
max_levels = [5,10,50,100,200,300]

# The estimator's parameter is spelled `max_depth`; the misspelled key is not
# a valid RandomForestClassifier parameter.
parameters = {'max_depth': max_levels}
rndforest = RandomForestClassifier(random_state=0)
# Results file named after the dataset and the estimator class.
fileName = r'{}_{}_results.csv'.format(dataset_name, rndforest.__class__.__name__)

# Reuses `data_fill` and `y_label`, which are created by the XGBoost cells.
clf_acplt = ParamSearch(rndforest, parameters)
clf_acplt.fit(data_fill[:, :cfg.params.vector_length.sentence_embedding], y_label, fileName)


# KNN

In [None]:
# Import ParamSearch through the `functions` package, as the first cell does:
# only "../.." is appended to sys.path, so a bare `Experimentations` module is
# not expected to resolve.
from functions.Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

# The search explores n_neighbors = 10, 20, ..., 500.
parameters = {'n_neighbors': np.arange(10, 501, 10)}
knn = KNeighborsClassifier()
# Results file named after the dataset and the estimator class.
fileName = r'{}_{}_results.csv'.format(dataset_name, knn.__class__.__name__)

# Features and target come from `data_matrix` (embedding columns followed by
# the dataset columns), like the other classifiers. The raw `data` frame holds
# no embedding columns, so slicing it by the embedding length selects the
# wrong columns and the target index falls out of range.
embedding_dim = cfg.params.vector_length.sentence_embedding
clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)