In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
import random
import time

# Project-local helper modules
from text_cleaning import TextCleaner
from Vectorize import to_vector

# Fix the random seeds so stochastic steps (e.g. the kmeans model) are
# reproducible. scikit-learn estimators left with random_state=None draw from
# NumPy's global RNG, which `random.seed` alone does not touch, so seed both.
random.seed(0)
np.random.seed(0)

# Global Variables 

In [2]:
from hydra import compose, initialize

# Compose the Hydra configuration named "main.yaml" from the "conf" directory.
# `cfg` supplies the data paths, embedding path and parameters read by the
# cells below.
with initialize(config_path="conf", version_base=None):
    cfg = compose(config_name="main.yaml")

# Data Import

In [3]:
# Data import
# Choose which configured dataset to load; the CSV paths come from the Hydra
# config (`cfg.path_data`). To use your own dataset, add a branch that reads
# your file, e.g. pd.read_csv(r'your-path/your-file.csv', delimiter=",").
dataset_name = "cpn27"

if dataset_name == "cpn27":
    data = pd.read_csv(cfg.path_data.cpn27, delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(cfg.path_data.cpn120, delimiter=",")
else:
    # Fail here with a clear message instead of leaving `data` undefined and
    # hitting a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected 'cpn27' or 'cpn120'."
    )


# Environment Configuration

In [4]:
# Load the pre-trained word embeddings (the Spanish Billion Words vectors,
# per the original note) from the configured path; the file is stored in the
# binary word2vec format.
embedding_path = cfg.path_embedding.word2vec
model = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=True)

# Load the spaCy pipeline named in the configuration.
nlp = spacy.load(cfg.enviroment.nlp)

# Text Cleaning

In [5]:
# Build the project's text cleaner from two config values: the spaCy pipeline
# name (the same value passed to spacy.load above) and the language setting.
# NOTE(review): how TextCleaner uses the language value is not visible here —
# confirm in text_cleaning.py.
Cln = TextCleaner(cfg.enviroment.nlp, cfg.enviroment.language)

In [6]:
# Cleaning routine applied to every entry of the text column. Per the original
# note, switch between the cleaner's "normalize", "lemmatize" or "stemming".
func = Cln.normalize

# Clean a copy so the raw `data` frame stays intact.
data_copy = data.copy()

start = time.time()
text_column = data_copy.iloc[:, 1]  # second column holds the text to clean
data_copy.iloc[:, 1] = text_column.apply(func)
end = time.time()

elapsed = end - start
print("Execution time:", elapsed)

Execution time: 17.552050590515137


# Word Embedding

In [7]:
# Timer
start = time.time()

# Number of components of the word embedding, taken from the config.
n_components = cfg.params.vector_lenght
descriptions = data_copy.iloc[:, 1]

# One row per data point, one column per embedding component, initialised to 0.
descriptions_matrix = np.zeros((data_copy.shape[0], n_components))

# Fill each row with the vector computed for the matching description.
for row, text in enumerate(descriptions):
    descriptions_matrix[row, :] = to_vector(text, model, n_components)

# Append the original columns to the right of the embedding columns.
data_matrix = np.concatenate([descriptions_matrix, data_copy], axis=1)

# Keep only the rows whose embedding part contains no missing values.
embedding_part = data_matrix[:, :n_components]
data_matrix = data_matrix[pd.notnull(embedding_part).all(axis=1)]

end = time.time()
print("Execution time:", end - start)

Execution time: 0.5040445327758789


In [8]:
from Experimentations import ParamSearch
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

# 5-fold splitter.
# NOTE(review): `kfold` is not passed to ParamSearch below — confirm whether
# it is still needed or should be handed to the search.
kfold = KFold(n_splits=5)

# Candidate var_smoothing values: 9 points geometrically spaced between
# 1e-06 and 1e+02.
parameters = {'var_smoothing': np.geomspace(1e-06, 1e+02, num=9)}
proc = GaussianNB()

# Inputs are the embedding columns. The target is the column two positions
# past the end of the embedding block, i.e. the third column of the original
# data appended during concatenation.
n_components = cfg.params.vector_lenght
features = data_matrix[:, :n_components]
labels = data_matrix[:, n_components + 2]

clf = ParamSearch(proc, parameters)
clf.fit(features, labels)



<Experimentations.ParamSearch at 0x173c066f8e0>

In [9]:
# Display the search summary stored on the fitted ParamSearch object.
# NOTE(review): the saved output below lists only two var_smoothing candidates
# (1e-09 and 1), while the grid defined in the previous cell has nine values
# from 1e-06 to 1e+02 — the output looks stale; re-run the notebook to refresh.
clf.cv_results_

{'param_grid': [{'var_smoothing': 1e-09}, {'var_smoothing': 1}],
 'train_mean': array([0.87925235, 0.72691304]),
 'train_std': array([0.00527944, 0.00822296]),
 'test_mean': array([0.40502525, 0.42244174]),
 'test_std': array([0.01137766, 0.01064809])}