In [2]:
import pandas as pd
import numpy as np
import spacy
import gensim
from collections import Counter 
import sklearn.cluster
import random
from scipy.spatial import distance
import time

# Created functions
from text_cleaning import TextCleaner
from Vectorize import to_vector

# Seed the global random generators so the notebook is reproducible.
# Python's `random` module alone does not influence scikit-learn: its
# estimators draw from NumPy's global generator (or from their own
# `random_state` argument), so NumPy is seeded here as well.
random.seed(0)
np.random.seed(0)

# Global Variables 

In [3]:
from hydra import compose, initialize

# Build the project configuration object `cfg` from the "main.yaml" config
# found under the "conf" config path; later cells read paths and parameters from it.
with initialize(config_path="conf", version_base=None):
    cfg = compose(config_name="main.yaml")

# Data Import

In [4]:
# Data import
# Name of the dataset to load; the CSV locations are read from the
# configuration (cfg.path_data). Change this value to switch datasets.
dataset_name = "cpn27"

if dataset_name == "cpn27":
    data = pd.read_csv(cfg.path_data.cpn27, delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(cfg.path_data.cpn120, delimiter=",")
else:
    # Fail here instead of leaving `data` undefined and hitting a NameError
    # in a later cell. To use your own dataset, add a branch above, e.g.:
    #     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected 'cpn27' or 'cpn120'"
    )


# Environment Configuration

In [5]:
# Load the pre-trained word2vec embeddings (Spanish Billion Words) from the
# path given in the configuration; the file is stored in binary format.
model = gensim.models.KeyedVectors.load_word2vec_format(cfg.path_embedding.word2vec, binary=True)

# Load the spaCy pipeline whose name is given in the configuration
nlp = spacy.load(cfg.enviroment.nlp)

# Text Cleaning

In [6]:
# Text cleaner built from the spaCy model name and the language set in the configuration
Cln = TextCleaner(
    cfg.enviroment.nlp,
    cfg.enviroment.language,
)

In [7]:
# Cleaning function applied to every description; per the project's cleaner,
# switch between "normalize", "lemmatize" or "stemming".
func = Cln.normalize

# Work on a copy so the raw `data` frame is left untouched and this cell
# produces the same result when re-run.
data_copy = data.copy()


start = time.time()
# Clean the text column (position 1) element-wise. Mapping over the single
# column avoids building a full row Series per record, which is what a
# row-wise DataFrame.apply(axis=1) does.
data_copy.iloc[:, 1] = data_copy.iloc[:, 1].map(func)
end = time.time()

# print("Execution time:", end-start)

# Word Embedding

In [8]:
# Start timing the embedding step
start = time.time()

# Pre-allocate the embedding matrix: one row per observation,
# one column per component of the word embedding
n_rows = data_copy.shape[0]
descriptions_matrix = np.zeros((n_rows, cfg.params.vector_lenght))

# Fill each row with the vector computed for the matching description
for row_idx, text in enumerate(data_copy.iloc[:, 1]):
    descriptions_matrix[row_idx, :] = to_vector(text, model, cfg.params.vector_lenght)

# Put the embedding columns first, followed by the original columns of each observation
data_matrix = np.concatenate((descriptions_matrix, data_copy), axis=1)

# Keep only the rows whose embedding columns contain no missing values
has_full_vector = pd.notnull(data_matrix[:, :cfg.params.vector_lenght]).all(axis=1)
data_matrix = data_matrix[has_full_vector]


end = time.time()
print("Execution time:", end-start)

Execution time: 0.5346596240997314


In [9]:
from sklearn.model_selection import GridSearchCV, KFold
from AC_PLT import AC_PLT

# Number of cross-validation folds; defined once so the splitter and the
# result arrays cannot get out of sync.
N_SPLITS = 5

# NOTE(review): KFold does not shuffle by default; if the rows are ordered
# (e.g. by label) the folds may be unrepresentative -- confirm this is intended.
kfold = KFold(n_splits=N_SPLITS)

parameters = {'n_clusters':[500, 1000]}

# Per-fold accuracies (only the test accuracy is filled in by the loop below)
k500_acc_top_train = np.zeros(N_SPLITS)
k500_acc_top_test = np.zeros(N_SPLITS)

for i, (train_index, test_index) in enumerate(kfold.split(data_matrix)):
    # The first `vector_lenght` columns hold the embedding features; the target
    # is the column at offset 2 of the original data (presumably the label -- confirm).
    train_X = data_matrix[train_index, :cfg.params.vector_lenght]
    train_y = data_matrix[train_index, cfg.params.vector_lenght+2]

    test_X = data_matrix[test_index, :cfg.params.vector_lenght]
    test_y = data_matrix[test_index, cfg.params.vector_lenght+2]

    # A fresh model per fold, configured from the project configuration
    proc = AC_PLT(
        n_clusters = cfg.params.kmeans.n_cluster,
        random_state = cfg.params.random_state
        )

    print("iteration:", i+1)
    # Training process
    proc.fit(train_X, train_y)

    # Fold accuracy: fraction of test observations predicted correctly
    k500_acc_top_test[i] = np.mean(proc.predict(test_X) == test_y)

iteration: 1


iteration: 2
iteration: 3
iteration: 4
iteration: 5


In [10]:
# Average test accuracy across the cross-validation folds
np.mean(k500_acc_top_test)

0.3598628322032577