- **Machine learning applications on text classification for companies**

The dataset that will be worked on is called "TCC.xlsx" and contains the information of the requirements, requests and petitions presented to the company SIGMA Ingeniería S.A of Manizales in the technical support area. 

The fields used in this work are "description" and "category". The goal is to find the best-performing technique for classifying the descriptions, so that it can be deployed in the company to classify future requirements automatically. The predicted category is intended to supply the solution protocols for the request presented by the client, which should improve the quality of the response and reduce the time the company's service and technical support area takes to respond to the client. 

# Libraries

In [2]:
#Classification Methods
from sklearn import svm

#NLP
import nltk 
nltk.download('stopwords') 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import spacy 
import es_core_news_sm

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import os

import warnings
warnings.filterwarnings('ignore')
import joblib
import dill as pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Functions

In [3]:
# Spanish spaCy pipeline, loaded once and shared by every lemmatizer() call.
nlp = spacy.load("es_core_news_sm")


def lemmatizer(text):
    """Return `text` with every spaCy token replaced by its lemma.

    The lemmas are joined with single spaces, in token order.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

In [18]:
# Over-sampling strategies this notebook can build. Each name is looked up as
# an attribute of `imblearn.over_sampling`.
_OVER_SAMPLING_METHODS = (
    "RandomOverSampler",
    "SMOTE",
    "SMOTEN",
    "ADASYN",
    "BorderlineSMOTE",
    "KMeansSMOTE",
    "SVMSMOTE",
)


def _build_over_sampler(method, y_test):
    """Instantiate the imbalanced-learn over-sampler called `method`."""
    # Imported lazily: the notebook's import cell only brings in SMOTE, so the
    # other sampler classes would otherwise be undefined names.
    from imblearn import over_sampling

    kwargs = {"random_state": 21}
    if method != "RandomOverSampler":
        # NOTE(review): n_jobs is kept exactly as the notebook passed it;
        # confirm the installed imbalanced-learn release still accepts it.
        kwargs["n_jobs"] = -1
    if method == "KMeansSMOTE":
        # Same setting as before: one neighbour per class present in y_test.
        kwargs["k_neighbors"] = np.unique(y_test).shape[0]
    return getattr(over_sampling, method)(**kwargs)


def class_balance_over_sampling(features, labels, HO=False, CV=True, methods_list=["SMOTE"]):
    """Over-sample the training split with each requested method and evaluate it.

    For every recognised name in `methods_list` the data is split 80/20
    (stratified, random_state=8), only the training part is re-sampled, and
    the result is handed to `classifier_metrics`.

    Parameters
    ----------
    features : feature matrix accepted by `train_test_split`.
    labels : class labels aligned with `features`.
    HO, CV : flags forwarded unchanged to `classifier_metrics`.
    methods_list : iterable of sampler names; names that are not in
        `_OVER_SAMPLING_METHODS` are skipped silently.

    Returns
    -------
    None. Results are printed by this function and by `classifier_metrics`.
    """
    for method in methods_list:
        if method not in _OVER_SAMPLING_METHODS:
            continue

        print(method)
        # The label-count printouts were disabled for SMOTE only; keep that.
        show_counts = method != "SMOTE"
        if show_counts:
            print("originals labels unique: ",np.unique(labels, return_counts=True))

        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.20, random_state=8, stratify=labels)

        # Re-sample after the split so the test set contains only real samples.
        sampler = _build_over_sampler(method, y_test)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        if show_counts:
            print("y_train labels unique:   ",np.unique(y_train, return_counts=True))
            print("y_test labels unique:    ",np.unique(y_test, return_counts=True))

        classifier_metrics(X_train,X_test,y_train,y_test,HO=HO,CV=CV)
            


In [19]:
def classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=False):
    """Evaluate every estimator in the notebook-level `classifier` list.

    Parameters
    ----------
    X_train, X_test : feature matrices (anything `csr_matrix` accepts).
    y_train, y_test : label sequences aligned with the matrices.
    HO : bool
        When true, fit on the training split, save the model with joblib and
        print hold-out metrics computed on the test split.
    CV : bool
        When true, run 10-fold cross-validation (accuracy) on the training
        and test splits stacked together.

    Both flags used to be accepted but ignored, so both evaluations always
    ran; they are honoured now. The model file is therefore only written
    when `HO` is true.

    Returns
    -------
    None. All results are printed.
    """
    # Local import: the notebook's import cell only brings in csr_matrix.
    from scipy.sparse import vstack

    def hold_out(model, X_tr, X_te):
        """Fit `model`, persist it and print the hold-out metrics."""
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_tr, y_train)
        # Stop the clock before saving so disk I/O is not counted as training.
        TIME = tm.time() - start_time

        model_filename = "Modelo_SVM_V4.sav" # name given to the trained model
        # The whole `classifier` list is dumped (it contains this fitted
        # model); the prediction cell reads the file back as `my_model[0]`.
        joblib.dump(classifier, model_filename) # Save the model
        print('Model is saved into to disk successfully Using Job Lib')
        print("Time, Training: {0:.4f} [seconds]".format(TIME))

        start_time = tm.time()
        y_pred = model.predict(X_te)
        TIME = tm.time() - start_time
        print("Time, Prediction: {0:.4f} [seconds]".format(TIME))

        accuracy_s  = accuracy_score(y_test,y_pred)
        f1_s        = f1_score(y_test,y_pred,average='weighted')
        recall_s    = recall_score(y_test,y_pred,average='weighted')
        precision_s = precision_score(y_test,y_pred,average='weighted')
        print('accuracy_score: {0:.4f}'.format(accuracy_s))
        print('f1_score: {0:.4f}'.format(f1_s))
        print('recall_score: {0:.4f}'.format(recall_s))
        print('precision_score: {0:.4f}'.format(precision_s))
        print ('\n clasification report:\n', classification_report(y_test, y_pred, digits = 4))

    def cross_validate(model, X_tr, X_te, dense):
        """Run 10-fold CV on train+test stacked together and print mean/std."""
        print('\nCross-Validation in process...')
        start_time = tm.time()
        # NOTE(review): KFold is unshuffled, and X_tr may already contain
        # synthetic samples when the caller over-sampled it — confirm this is
        # the evaluation that is wanted.
        kfold = model_selection.KFold(n_splits=10)
        y_CV = np.concatenate((y_train,y_test))
        if dense:
            X_CV = np.concatenate((X_tr,X_te))
        else:
            # Stack the sparse matrices directly instead of densifying them.
            X_CV = vstack((X_tr, X_te), format="csr")
        cv_results = np.array(model_selection.cross_val_score(model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-1))

        # Folds that failed to score come back as NaN; leave them out.
        cv_results = cv_results[np.logical_not(np.isnan(cv_results))]
        TIME = tm.time() - start_time
        print("Time, CV: {0:.4f} [seconds]".format(TIME))
        print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(),cv_results.std()))

    for name in classifier:
        print ("---------------------------------------------------------------------------------\n")
        print(str(name))

        # GaussianNB and LinearDiscriminantAnalysis are fed dense arrays here;
        # every other estimator gets CSR matrices.
        needs_dense = "GaussianNB" in str(name) or "LinearDiscriminantAnalysis" in str(name)
        X_tr = csr_matrix(X_train)
        X_te = csr_matrix(X_test)
        if needs_dense:
            X_tr = X_tr.toarray()
            X_te = X_te.toarray()

        if HO:
            hold_out(name, X_tr, X_te)
        if CV:
            cross_validate(name, X_tr, X_te, needs_dense)
        print()
        

In [6]:
# Directory where CR_viz / CM_viz write their PDF figures.
path_figures = "../images"
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(path_figures, exist_ok=True)

# Classification report
def CR_viz():
    """Draw the classification report for `model_selected` and save it as PDF.

    Reads the notebook-level names `model_selected`, `classes`, `model_name`,
    `X_train`, `y_train`, `X_test`, `y_test` and `path_figures`. The figure is
    written to `<path_figures>/<model_name>_CR.pdf`.

    NOTE(review): `ClassificationReport` is not defined by the import cell
    (its yellowbrick import is commented out there); it must be in scope
    before this function is called.
    """
    figure = plt.figure(figsize=(15,20))
    report_title = "Classification Report - "+model_name
    visualizer = ClassificationReport(
        model_selected,
        classes=classes,
        support=True,
        cmap='Blues',
        title=report_title,
    )
    # Fit on the training split, then score the held-out split.
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()
    figure.show()
    output_path = path_figures+"/"+model_name+"_CR"+".pdf"
    figure.savefig(output_path, bbox_inches = "tight")

# Confusion matrix
def CM_viz():
    """Fit `model_selected`, plot its confusion matrix and save it as PDF.

    Reads the notebook-level names `model_selected`, `model_name`, `X_train`,
    `y_train`, `X_test`, `y_test` and `path_figures`. The figure is written to
    `<path_figures>/<model_name>_CM.pdf`.
    """
    # Apply the font scale before any artist is created so it affects this
    # figure; calling it after the heatmap only changed later plots.
    sns.set(font_scale=2)

    model_selected.fit(X_train, y_train)
    y_pred = model_selected.predict(X_test)

    # Use one explicit label list for both the matrix and the tick labels.
    # The matrix covers the union of true and predicted classes, so labelling
    # the axes with np.unique(y_test) alone could misalign them.
    labels = np.unique(np.concatenate((np.asarray(y_test), np.asarray(y_pred))))
    conf = confusion_matrix(y_test, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(42 , 42))
    annot_kws={'fontsize':20, 'verticalalignment':'center' }
    sns.heatmap(conf, annot=True, cmap='Blues', fmt='d', annot_kws=annot_kws,
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title("Confusion Matrix - "+model_name, fontsize = 35)
    ax.set_xlabel("Predicted Values", fontsize = 35)
    ax.set_ylabel("Actual Values", fontsize = 35)
    fig.savefig(path_figures+"/"+model_name+"_CM"+".pdf", bbox_inches = "tight")

# Dataset preparation

In [10]:
# Load the ticket dataset from the Excel workbook.
# (The single-argument os.path.join and the discarded `.shape` expression were
# no-ops and have been removed.)
filename = '../Data/TCC_NEW_V2.xlsx'
DataSet0 = pd.read_excel(filename, engine='openpyxl')
DataSet0

Unnamed: 0,tik_codigo,descripcion,categoria
0,TIK7201,En Seguimiento documental de Autos hay un camp...,Duda en uso de campos
1,TIK8740,no se muestra nombre de ruta solo muestra núme...,Duda en uso de campos
2,TIKS01,Para que sirve el campo que aparece en el reporte,Duda en uso de campos
3,TIKS02,No entiendo a que se refiere el campo 2 del mo...,Duda en uso de campos
4,TIKS03,El campo 5 en la plataforma de corpo para que ...,Duda en uso de campos
...,...,...,...
2141,TIK11427,No definido: Actividades de mercadeo,0- No definido
2142,TIK11485,No definido: Eliminar los expedientes OOCA-001...,0- No definido
2143,TIK11441,No definido: Construcción de informe mensual P...,0- No definido
2144,TIK10979,No definido: Planeación del proyecto Desarroll...,0- No definido


In [11]:
# Discard every row whose description or category is missing, then renumber
# the index from 0 so later positional lookups stay contiguous.
DataSet0 = (
    DataSet0
    .dropna(subset=['descripcion', 'categoria'])
    .reset_index(drop=True)
)

# Best Machine learning application (SVM)

## 4. Dataset with Preprocessing and Balancing, Optimization of parameters (DPBO)

In [22]:
# Convert texts to lowercase
# The 'descripcion' column is overwritten with its lower-cased version; the
# trailing expression only displays the frame's shape as the cell output.
DataSet0['descripcion'] = DataSet0['descripcion'].str.lower()
DataSet0.shape

(2146, 3)

In [23]:
# Remove punctuation marks: every character in string.punctuation becomes a
# single space.
# One vectorised translate replaces the nested Python loop, which assigned
# through chained indexing (DataSet0[col][row] = ...). That pattern is slow and
# stops writing through to the frame when pandas Copy-on-Write is enabled.
punct = string.punctuation
punct_to_space = str.maketrans({c: " " for c in punct})

DataSet0['descripcion'] = DataSet0['descripcion'].str.translate(punct_to_space)
DataSet0.shape

(2146, 3)

In [24]:
# Lemmatize the description field (lemmatizer() joins spaCy lemmas; this is
# lemmatization rather than stemming)
DataSet0['descripcion'] = DataSet0['descripcion'].apply(lemmatizer)
DataSet0.shape

(2146, 3)

In [25]:
# Features and target for the classifiers:
#   X - the ticket description text, which determines the category
#   y - the category label, one of the defined categories
X, y = DataSet0['descripcion'], DataSet0['categoria']

In [26]:
# The descriptions are text, so they must be turned into numbers before a
# machine learning algorithm can use them. TfidfVectorizer encodes each word
# (and word pair, ngram_range=(1, 2)) as a floating point TF-IDF weight; this
# step is also known as feature extraction or vectorization.
#
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split performed in class_balance_over_sampling, so the IDF
# weights also see the test descriptions — confirm this is acceptable.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words=stopwords.words("spanish"))
X = vectorizer.fit_transform(X)

# Save word dictionary created by TfidfVectorizer.
# The `with` block closes the file handle, which the bare open(...) never did.
with open("Features_SVM_V4.pkl", "wb") as vocabulary_file:
    pickle.dump(vectorizer.vocabulary_, vocabulary_file)

In [27]:
# Estimators evaluated on TCC_NEW_V2; classifier_metrics reads this list as a
# notebook-level name.
classifier = [
    svm.SVC(kernel='rbf', C=1_000_000, gamma=1e-6),
]

# Over-sampling strategies to apply to the training split.
methods_list = ["SMOTE"]

class_balance_over_sampling(X, y, HO=True, CV=False, methods_list=methods_list)

SMOTE
---------------------------------------------------------------------------------

SVC()

Hold-Out in process...
Model is saved into to disk successfully Using Job Lib
Time, Training: 0.6902 [seconds]
Time, Prediction: 0.0630 [seconds]
accuracy_score: 0.8884
f1_score: 0.8892
recall_score: 0.8884
precision_score: 0.9098

 clasification report:
                                                            precision    recall  f1-score   support

                                           0- No definido     1.0000    1.0000    1.0000        12
                                   1- Nuevo requerimiento     1.0000    1.0000    1.0000        13
        Adición o modificación de funcionalidad en perfil     0.7500    0.9000    0.8182        10
                                    Auditoria del sistema     1.0000    0.4444    0.6154         9
                            Calculo erroneo en formulario     0.9231    1.0000    0.9600        12
                                     Cambiar datos po

In [29]:
# Load the saved model and check it by predicting one ticket typed by the user

model_filename = "Modelo_SVM_V4.sav"

my_model = joblib.load(model_filename)

# input() displays its own prompt. Wrapping the prompt in print() passed None
# to input(), which showed "None" in front of the text being typed.
Descripcion_ticket = input("Ingrese la descripción del Ticket: ") # sample input: configuracion de GPS para Palmira
print(f" El ticket ingresado es: {Descripcion_ticket}")

# Apply the same preprocessing as the training descriptions (lowercase,
# punctuation to spaces, lemmatization) so the features are comparable.
ticket_preprocesado = Descripcion_ticket.lower()
ticket_preprocesado = ticket_preprocesado.translate(
    str.maketrans({c: " " for c in string.punctuation}))
ticket_preprocesado = lemmatizer(ticket_preprocesado)

# The file holds the whole `classifier` list, so the fitted model is element 0.
Prediccion = my_model[0].predict(vectorizer.transform([ticket_preprocesado]))

Prediccion

Ingrese la descripción del Ticket: 
NoneNuevo requerimiento: se pide realizar estudio de mercadeo 
 El ticket ingresado es: Nuevo requerimiento: se pide realizar estudio de mercadeo 


array(['1- Nuevo requerimiento'], dtype=object)