In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import sklearn.cluster
import random
from scipy.spatial import distance
import time

# Created functions
from text_cleaning import TextCleaner
from Vectorize import to_vector

# Set of the random seed for the kmeans model
random.seed(0)

# Global Variables 

In [2]:
from hydra import initialize, compose

with initialize(version_base=None, config_path="conf"):
    cfg=compose(config_name="main.yaml")

# Data Import

In [3]:
# Data import
# Relative Path of the dataset, change for your dataset
dataset_name = "cpn27"

if dataset_name == "cpn27":
    data = pd.read_csv(cfg.path_data.cpn27, delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(cfg.path_data.cpn120, delimiter=",")
# else:
#     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")


# Enviroment Configuration

In [4]:
# Import of the model of the spanish billion words embeddings
model = gensim.models.KeyedVectors.load_word2vec_format( 
    cfg.path_embedding.word2vec, # Relative path of the vector
    binary=True # The model is in binary format
) 

nlp = spacy.load(cfg.enviroment.nlp)

# Text Cleaning

In [5]:
Cln = TextCleaner(cfg.enviroment.nlp, cfg.enviroment.language)

In [8]:
# Function to apply to the dataset, change between "normalize", "lemmatize" or "stemming"
func = Cln.normalize

data_copy = data.copy()


start = time.time()
data_copy.iloc[:, 1] = data_copy.apply(lambda row: func(row.iloc[1]), axis=1)
end = time.time()

print("Execution time:", end-start)

Execution time: 25.97251796722412


# Word Embedding

In [9]:
# Timer
start = time.time()

# Creation of a matrix full of 0 to save the vectors of each feature
descriptions_matrix = np.zeros( 
    (
        data_copy.shape[0], # the number of data points
        cfg.params.vector_lenght  # the number of components of the word embedding
    )
)

# Matrix filling with the vectors of each point
for i,description in enumerate(data_copy.iloc[:,1]):
    descriptions_matrix[i,] = to_vector(description, model, cfg.params.vector_lenght)

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix,data_copy], axis=1)

# Remove of the 'Nan' values in the data
data_matrix = data_matrix[~pd.isnull(data_matrix[:,:cfg.params.vector_lenght]).any(axis=1)]


end = time.time()
print("Execution time:", end-start)

Execution time: 1.5281310081481934


In [10]:
from sklearn.model_selection import GridSearchCV
from AC_PLT import AC_PLT
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

kfold=KFold(n_splits=5)

parameters = {'var_smoothing': (1e-9, 1)}

proc = GaussianNB()

clf = GridSearchCV(proc, parameters, scoring="accuracy")
clf.fit(data_matrix[:, :cfg.params.vector_lenght], data_matrix[:, cfg.params.vector_lenght+2])




In [11]:
clf.cv_results_

{'mean_fit_time': array([0.22402277, 0.19475689]),
 'std_fit_time': array([0.04001708, 0.01895391]),
 'mean_score_time': array([3.70652342, 3.25993004]),
 'std_score_time': array([0.32186439, 0.12519547]),
 'param_var_smoothing': masked_array(data=[1e-09, 1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'var_smoothing': 1e-09}, {'var_smoothing': 1}],
 'split0_test_score': array([0.38461538, 0.42206478]),
 'split1_test_score': array([0.41396761, 0.40283401]),
 'split2_test_score': array([0.40182186, 0.42510121]),
 'split3_test_score': array([0.41641337, 0.43465046]),
 'split4_test_score': array([0.408308  , 0.42755826]),
 'mean_test_score': array([0.40502525, 0.42244174]),
 'std_test_score': array([0.01137766, 0.01064809]),
 'rank_test_score': array([2, 1])}