In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
import random
import time

# Created functions
from text_cleaning import TextCleaner
from Vectorize import to_vector

# Seed the random number generators so the k-means based model is reproducible.
# `random.seed` only affects Python's standard-library generator; NumPy keeps its
# own global state, so it is seeded as well.
# NOTE(review): presumably the k-means step inside AC_PLT draws from NumPy's RNG — confirm.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Global Variables 

In [2]:
from hydra import initialize, compose

# Compose the Hydra configuration `main.yaml` from the `conf` config path.
# `cfg` is read by the cells below for the dataset paths (cfg.path_data), the
# embedding file (cfg.path_embedding), the NLP settings (cfg.enviroment) and
# the embedding size (cfg.params.vector_length).
with initialize(version_base=None, config_path="conf"):
    cfg=compose(config_name="main.yaml")

# Data Import

In [3]:
# Data import
# Choose which dataset to load; the CSV locations come from the Hydra config (cfg.path_data).
dataset_name = "cpn120"

if dataset_name == "cpn27":
    data = pd.read_csv(cfg.path_data.cpn27, delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(cfg.path_data.cpn120, delimiter=",")
else:
    # Fail here with a clear message instead of a NameError on `data` in a later cell.
    # To use your own file, add a branch such as:
    #     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")
    raise ValueError(
        "Unknown dataset_name {!r}; expected 'cpn27' or 'cpn120'.".format(dataset_name)
    )


# Environment Configuration

In [4]:
# Pretrained word2vec vectors (Spanish Billion Words embeddings), stored in binary format.
# The file location is taken from the Hydra config.
embedding_path = cfg.path_embedding.word2vec
model = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=True)

# spaCy pipeline whose name/path is given in the config
spacy_model_name = cfg.enviroment.nlp
nlp = spacy.load(spacy_model_name)

# Text Cleaning

In [5]:
# Text-cleaning helper built from the configured spaCy model identifier and language.
# Its `normalize` method is the cleaning function applied to the descriptions below.
Cln = TextCleaner(cfg.enviroment.nlp, cfg.enviroment.language)

In [6]:
# Cleaning function applied to every description; change between "normalize",
# "lemmatize" or "stemming".
# NOTE(review): only `Cln.normalize` is referenced in this notebook — confirm the
# other two methods exist on TextCleaner before switching.
func = Cln.normalize

# Work on a copy so the raw `data` frame stays untouched.
data_copy = data.copy()

# perf_counter is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
data_copy.iloc[:, 1] = data_copy.iloc[:, 1].apply(func)  # column 1 holds the text to clean
end = time.perf_counter()

print("Execution time:", end-start)

Execution time: 104.8928611278534


# Word Embedding

In [7]:
# Timer (perf_counter: monotonic clock intended for elapsed-time measurements,
# unlike time.time() which follows the adjustable wall clock)
start = time.perf_counter()

embedding_dim = cfg.params.vector_length  # number of components of each embedding vector

# Zero-initialised matrix: one row per data point, one column per embedding component
descriptions_matrix = np.zeros((data_copy.shape[0], embedding_dim))

# Fill each row with the vector computed for the corresponding cleaned description
for row, description in enumerate(data_copy.iloc[:, 1]):
    descriptions_matrix[row, :] = to_vector(description, model, embedding_dim)

# Append the original columns to the right of the embedding columns
data_matrix = np.concatenate([descriptions_matrix, data_copy], axis=1)

# Drop every row whose embedding part contains a missing value
has_missing = pd.isnull(data_matrix[:, :embedding_dim]).any(axis=1)
data_matrix = data_matrix[~has_missing]

end = time.perf_counter()
print("Execution time:", end-start)

Execution time: 2.1804134845733643


# AC-PLT

In [1]:
from Experimentations import ParamSearch
from AC_PLT import AC_PLT

# Grid of cluster counts to evaluate: 50, 100, ..., 1750
parameters = {'n_clusters': np.arange(50, 1800, 50)}
ac_plt = AC_PLT()

# File name handed to ParamSearch.fit, built from the dataset and the model class name
# NOTE(review): presumably fit() writes the search results there — confirm in Experimentations.
file_name = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, type(ac_plt).__name__)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], file_name)

NameError: name 'np' is not defined

In [9]:
# Export the search results through ParamSearch.to_csv (no path is passed here).
# NOTE(review): presumably the file name given to fit() is reused — confirm in ParamSearch.
# `clf_acplt` is rebound by later cells, so run this right after the search it belongs to.
clf_acplt.to_csv()

<Experimentations.ParamSearch at 0x2cabc4fda80>

# Naïve Bayes

In [10]:
from Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

# Candidate var_smoothing values: 9 points spaced geometrically from 1e-06 to 1e+02
parameters = {'var_smoothing': np.geomspace(1e-06, 1e+02, num=9)}
proc = GaussianNB()

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
nb_inputs = data_matrix[:, :embedding_dim]
nb_targets = data_matrix[:, embedding_dim + 2]

clf = ParamSearch(proc, parameters)
clf.fit(nb_inputs, nb_targets)





<Experimentations.ParamSearch at 0x2cbebb4d360>

In [11]:
# Display the results gathered by the Naïve Bayes search (shown as the cell output)
clf.cv_results_

{'params': [{'var_smoothing': 1e-06},
  {'var_smoothing': 1e-05},
  {'var_smoothing': 0.0001},
  {'var_smoothing': 0.001},
  {'var_smoothing': 0.01},
  {'var_smoothing': 0.1},
  {'var_smoothing': 1.0},
  {'var_smoothing': 10.0},
  {'var_smoothing': 100.0}],
 'train_mean': array([0.7487839 , 0.74881529, 0.74887805, 0.74922327, 0.75047861,
        0.74256215, 0.62481169, 0.60896311, 0.11165424]),
 'train_std': array([0.00147615, 0.00147441, 0.0014603 , 0.00143501, 0.00152015,
        0.00134227, 0.00228291, 0.00141481, 0.0018167 ]),
 'test_mean': array([0.606955  , 0.60698638, 0.60701776, 0.60745711, 0.61100331,
        0.61878623, 0.54017083, 0.53766016, 0.10968489]),
 'test_std': array([0.0112798 , 0.01127949, 0.01107009, 0.01066549, 0.00886624,
        0.0093967 , 0.00901088, 0.00739244, 0.00414743])}

In [12]:
# Results file named after the dataset and the estimator class
nb_results_path = r'../data/results/Word2Vec/{}_{}_results.csv'.format(dataset_name, type(proc).__name__)
clf.to_csv(nb_results_path)

<Experimentations.ParamSearch at 0x2cbebb4d360>

# Support Vector Classifier

In [8]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF-kernel SVC: search over C on a geometric grid of 7 values from 1e-05 to 1e+01
kernel = 'rbf'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(gamma='auto', kernel=kernel)
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, type(svc_rbf).__name__, kernel)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)



<Experimentations.ParamSearch at 0x2443c1217e0>

In [9]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# RBF-kernel SVC, larger C values: 4 points spaced geometrically from 1e+01 to 1e+04
# NOTE(review): the file name below is identical to the one built for the other rbf
# search in this notebook — check whether ParamSearch overwrites or appends.
kernel = 'rbf'
parameters = {'C': np.geomspace(1e+01, 1e+04, num=4)}
svc_rbf = SVC(gamma='auto', kernel=kernel)
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, type(svc_rbf).__name__, kernel)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)



<Experimentations.ParamSearch at 0x24446d72a40>

In [14]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Linear-kernel SVC: search over C on a geometric grid of 7 values from 1e-05 to 1e+01
# (the estimator variable keeps the name `svc_rbf` even though the kernel is linear)
kernel = 'linear'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(gamma='auto', kernel=kernel)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2])

# Here the results are exported explicitly after fitting
linear_results_path = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, type(svc_rbf).__name__, kernel)
clf_acplt.to_csv(linear_results_path)



<Experimentations.ParamSearch at 0x2cbebb4fa30>

In [10]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Polynomial-kernel SVC: search over the polynomial degree, 1 through 6
# (the estimator variable keeps the name `svc_rbf` even though the kernel is poly)
kernel = 'poly'
parameters = {'degree': np.arange(1, 7)}
svc_rbf = SVC(gamma='auto', kernel=kernel)
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, type(svc_rbf).__name__, kernel)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)





<Experimentations.ParamSearch at 0x24446d723b0>

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

# Sigmoid-kernel SVC: search over C on a geometric grid of 7 values from 1e-05 to 1e+01
# (the estimator variable keeps the name `svc_rbf` even though the kernel is sigmoid)
kernel = 'sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(gamma='auto', kernel=kernel)
fileName = r'../data/results/Word2Vec/{}_{}{}_results.csv'.format(dataset_name, type(svc_rbf).__name__, kernel)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

# Decision Trees

In [13]:
from Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the gini criterion: search over max_leaf_nodes = 100, 200, ..., 1500
criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(100, 1501, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
fileName = r'../data/results/Word2Vec/{}_{}_{}_results.csv'.format(dataset_name, type(desition_tree).__name__, criterion)

# Inputs: the embedding columns. Target: the column at offset +2 after them.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)

# clf_acplt.to_csv()



<Experimentations.ParamSearch at 0x20abd22bd00>

# XGBoost

In [None]:
# Pad rare classes with all-zero embedding rows so that every class has at least `n` rows.
# NOTE(review): presumably required so each class can appear in every split of the
# parameter search — confirm against ParamSearch.
#
# Fixes relative to the previous version of this cell:
#   * `vector_length` is not defined anywhere in the notebook; the value lives in
#     cfg.params.vector_length.
#   * the raw `data` frame has no embedding columns; the matrix that does is `data_matrix`.
#   * concatenating the zero vector with a string array produced an all-string row, so the
#     padded feature values were text rather than numbers; rows are now object arrays.
#   * rows are collected in a list and stacked once instead of calling vstack per row.
embedding_dim = cfg.params.vector_length
label_col = embedding_dim + 2  # same target column the other model cells use

n = 5  # minimum number of rows wanted per class
cod = pd.Series(data_matrix[:, label_col]).value_counts()
reduce_cod = cod[cod < n]

padding_rows = []
for key, value in reduce_cod.items():
    nrow = np.empty(data_matrix.shape[1], dtype=object)
    nrow[:embedding_dim] = 0.0   # zero embedding
    nrow[embedding_dim:] = ''    # blank out the non-embedding columns...
    nrow[label_col] = key        # ...except the class label
    # add as many copies as are missing to reach `n` rows for this class
    padding_rows.extend(nrow.copy() for _ in range(int(n - value)))

if padding_rows:
    data_fill = np.vstack([data_matrix] + padding_rows)
else:
    data_fill = data_matrix.copy()

In [None]:
# Build lookup tables between class labels and consecutive integer ids.
# `vector_length` is not defined in this notebook, so the embedding size is read
# from cfg.params.vector_length like in the other cells.
y = data_fill[:, cfg.params.vector_length + 2]

# np.unique returns the distinct labels in sorted order; ids follow that order
labels = np.unique(y)
class2idx = {label: idx for idx, label in enumerate(labels)}
idx2class = {idx: label for label, idx in class2idx.items()}

In [None]:
# Translate every label in `y` to its integer id through the class2idx lookup table
y_label = np.vectorize(class2idx.get)(y)

In [None]:
# Candidate values for the 'max_leaves' entry of the XGBoost parameter grid
max_leaves = [
    10, 50, 100, 500,
    1000, 1500, 2000,
]

In [None]:
from xgboost import XGBClassifier
from Experimentations import ParamSearch

# XGBoost search over max_leaves and n_estimators (1 through 19)
parameters = {'max_leaves': max_leaves, 'n_estimators': np.arange(1, 20, 1)}

bst = XGBClassifier(learning_rate=1, objective='multi:softprob', random_state=0)
# NOTE(review): unlike the other model cells, this file name has no
# '../data/results/Word2Vec/' prefix — confirm where the results should go.
fileName = r'{}_{}_results.csv'.format(dataset_name, bst.__class__.__name__)

# `vector_length` is not defined in this notebook; the embedding size comes from the config.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(bst, parameters)
# Inputs: embedding columns of the padded matrix. Target: the integer class ids.
clf_acplt.fit(data_fill[:, :embedding_dim], y_label, fileName)


# KNN

In [None]:
from Experimentations import ParamSearch
from sklearn.neighbors import KNeighborsClassifier

# KNN search over n_neighbors = 10, 20, ..., 500
parameters = {'n_neighbors': np.arange(10, 501, 10)}
knn = KNeighborsClassifier()
# NOTE(review): unlike the other model cells, this file name has no
# '../data/results/Word2Vec/' prefix — confirm where the results should go.
fileName = r'{}_{}_results.csv'.format(dataset_name, knn.__class__.__name__)

# Fixes: `vector_length` is not defined in this notebook (the value lives in the config),
# and the raw `data` frame holds no embedding columns. Use `data_matrix`, exactly as the
# other model cells do.
embedding_dim = cfg.params.vector_length
clf_acplt = ParamSearch(knn, parameters)
clf_acplt.fit(data_matrix[:, :embedding_dim], data_matrix[:, embedding_dim + 2], fileName)