In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim
import random
import time


# Word Embedding
from sentence_transformers import SentenceTransformer

# Created functions
from text_cleaning import TextCleaner
from Vectorize import to_vector
from Experimentations import ParamSearch


  from .autonotebook import tqdm as notebook_tqdm


# Variables 

In [4]:
vector_length = 768

# Data Import

In [7]:
# Data import
# Relative Path of the dataset, change for your dataset
dataset_name = "cpn120"
file_path = 'data/'

if dataset_name == "cpn27":
    data = pd.read_csv(file_path+'CPN27.csv', delimiter=",")
elif dataset_name == "cpn120":
    data = pd.read_csv(file_path+'CPN27.csv', delimiter=",")
# else:
#     data = pd.read_csv(r'your-path/your-file.csv', delimiter=",")


# Model

In [8]:
# Timer
start = time.time()

model = SentenceTransformer('sentence-transformers/LaBSE')

descriptions_matrix = np.zeros( # creatign an empty matrix
    (
        len(data),          # the number of data points
        vector_length       # the number of components of the word embedding
    )
)

# Matrix filling 
# Change to the name of the descriptions of your dataset.
for i,description in enumerate(data.iloc[:,1]):
    vector = model.encode(description)
    descriptions_matrix[i,] = vector

# Concatenate the matrix with the data of each observation
data_matrix = np.concatenate([descriptions_matrix,data], axis=1)


# Remove of the 'Nan' data
data_matrix = data_matrix[~pd.isnull(data_matrix[:,:vector_length]).any(axis=1)]


end = time.time()
print("Exceution time:", end-start)

Exceution time: 203.36830377578735


# AC-PLT

In [None]:
from AC_PLT import AC_PLT

parameters = {'n_clusters': np.arange(50, 1800, 50)}
ac_plt = AC_PLT()

file_name = r'{}_{}_results.csv'.format(dataset_name, ac_plt.__class__.__name__)

clf_acplt = ParamSearch(ac_plt, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2],file_name)

# Naive Bayes

In [None]:
from Experimentations import ParamSearch
from sklearn.naive_bayes import GaussianNB

parameters = {'var_smoothing': np.geomspace(1e-06, 1e+02, num=9)}
proc = GaussianNB()

file_name = r'{}_{}_results.csv'.format(dataset_name, proc.__class__.__name__)

clf = ParamSearch(proc, parameters)
clf.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2],file_name)

# SVC

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='rbf'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_rbf = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_rbf, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='linear'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_linear = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_linear.__class__.__name__, kernel)


clf_acplt = ParamSearch(svc_linear, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2])

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='poly'
parameters = {'degree': np.arange(1,7)}
svc_poly = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_poly.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_poly, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)

In [None]:
from Experimentations import ParamSearch
from sklearn.svm import SVC

kernel='sigmoid'
parameters = {'C': np.geomspace(1e-05, 1e+01, num=7)}
svc_sigmoid = SVC(kernel=kernel, gamma='auto')
fileName = r'{}_{}{}_results.csv'.format(dataset_name, svc_rbf.__class__.__name__, kernel)

clf_acplt = ParamSearch(svc_sigmoid, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)

# Decision Tree

In [None]:
from Experimentations import ParamSearch
from sklearn.tree import DecisionTreeClassifier

criterion = 'gini'
parameters = {'max_leaf_nodes': np.arange(100, 1501, 100)}
desition_tree = DecisionTreeClassifier(criterion=criterion)
fileName = r'{}_{}_{}_results.csv'.format(dataset_name, desition_tree.__class__.__name__, criterion)

clf_acplt = ParamSearch(desition_tree, parameters)
clf_acplt.fit(data_matrix[:, :vector_length], data_matrix[:, vector_length+2], fileName)