In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santiago.bruzza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santiago.bruzza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### ETL

In [4]:
df = pd.read_excel('./dataset/nps_comments.xlsx')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   order_number  1046 non-null   int64 
 1   comments      1046 non-null   object
 2   category      1046 non-null   object
 3   rating        1046 non-null   int64 
 4   order_source  1046 non-null   object
dtypes: int64(2), object(3)
memory usage: 41.0+ KB


In [12]:
#Separo en X and y (en X me quedo con los comentarios y en y con la label que los clasifica). En el articulo X is a list and y is a numpy array
X, y = df[['order_number','comments']] , np.array(df['new_category'])

In [13]:
#Transformo los comentarios para estandarizarlo y quedarme solo con las palabras que aporten info
documents = []

comments_list = list(X['comments'])

stemmer = WordNetLemmatizer()

for sen in range(0, len(comments_list)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(comments_list[sen]))
    
    # remove all single characters
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization. Lemmatization is done in order to avoid creating features that are semantically similar but syntactically different. For instance "cats" is converted into "cat"
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)


##### Convert text into numbers. 
Vamos a usar el Bag of Words Model, convertimos cada palabra unica que aparece en cada comentario en una feature

Parameters:
- max_features: Nos quedamos con las max_features palabras unicas mas frecuentes. Palabras poco frecuentes no aportan mucho
- min_df: Es el min numero de comentarios que tienen que incluir la palabra para que la consideremos una feature. Es decir la palabra tiene que aparecer minimo en min_df comentarios para tenerla en cuenta (palabras muy raras no sirven)
- max_df: Nos quedamos con las palabras que aparecen como maximo en un max_df [%] de los comentarios. Palabras demasiado comunes no aportan nada (palabras muy comunes no sirven)
- stop_words: es una lista de palabras (medio std) que no aportan info

The bags of words le asigna un score a cada palabra segun la frecuencia con la que aparece en un comentario pero no tiene en cuenta la frecuencia total (teniendo en cuenta todos los comentarios)
por eso usamos TFIDF, The TF stands for "Term Frequency" while IDF stands for "Inverse Document Frequency".

Term frequency = (Number of Occurrences of a word)/(Total words in the document)

IDF(word) = Log((Total number of documents)/(Number of documents containing the word))

Palabra muy frecuente en el comentario y poco frecuente en el resto -> +TFIDF

In [None]:
#Parametros que use en TFIDF pero da mejor con lo de default
#max_features=3000, min_df=2, max_df=0.5, stop_words=stopwords.words('spanish')

In [35]:
tfidfconverter = TfidfVectorizer()
numerical_X = tfidfconverter.fit_transform(documents).toarray()

In [36]:
#Separo en Train and Test datasets y mantengo el numero de orden para poder identificar cada comentario al final
X_train, X_test, y_train, y_test, orders_train, orders_test = train_test_split(
    numerical_X, y, X['order_number'], test_size=0.2, random_state=42)


#### Modelo y predicciones

In [37]:
#Modelo de clasificación
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

In [38]:
#Predicciones
y_pred = classifier.predict(X_test)
proba = classifier.predict_proba(X_test)

In [39]:
pd.DataFrame(y_test).value_counts()

positivo                       92
sin stock                      51
sin comentario                 30
problemas con la plataforma    28
compra                         22
entrega                        10
Negativo                        5
sugerencia                      4
sin sentido                     3
reclamo                         1
Name: count, dtype: int64

In [40]:
#Metricas de evalución del modelo
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))



[[ 3  0  0  2  0  0  0  0  0  0]
 [ 0  9  0  3  2  0  4  0  4  0]
 [ 0  0  5  2  2  0  0  0  1  0]
 [ 0  0  0 85  3  0  4  0  0  0]
 [ 0  0  3  5 19  0  1  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  1  1  0 28  0  0  0]
 [ 0  0  0  1  0  0  1  1  0  0]
 [ 0  1  0  1  2  0  0  0 47  0]
 [ 0  0  0  1  1  0  0  0  0  2]]
                             precision    recall  f1-score   support

                   Negativo       1.00      0.60      0.75         5
                     compra       0.90      0.41      0.56        22
                    entrega       0.62      0.50      0.56        10
                   positivo       0.84      0.92      0.88        92
problemas con la plataforma       0.61      0.68      0.64        28
                    reclamo       0.00      0.00      0.00         1
             sin comentario       0.74      0.93      0.82        30
                sin sentido       1.00      0.33      0.50         3
                  sin stock       0.90      0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
#Save the model
with open('nps_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [None]:
#Read the model and save into a variable named model
with open('nps_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

#### Hyperparameter tuning:

In [106]:
#Parametros del modelo sin hyperparameter tuning
classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [109]:
#Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 4000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [117]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


  warn(


In [118]:
#Best params
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 80,
 'bootstrap': True}

In [119]:
#Best model
best_random = rf_random.best_estimator_

#Fit and predict
best_random.fit(X_train, y_train) 
y_pred_best = best_random.predict(X_test)

#Accuracy
print(accuracy_score(y_test, y_pred_best))

  warn(


0.7560975609756098


In [129]:
y_pred_proba[1]

array([7.27924838e-04, 6.45089944e-05, 6.52554416e-02, 2.05998240e-01,
       4.61977727e-02, 4.68026141e-01, 2.19775093e-04, 1.88288514e-02,
       2.30577475e-03, 1.73755688e-02, 1.75000000e-01])

In [124]:
labels = np.argmax(y_pred_proba, axis=1)

In [122]:
labels = np.argmax(y_pred_proba, axis=1)
classes = classifier.classes_
labels = [classes[i] for i in labels]
print(accuracy_score(y_test, labels))

0.7601626016260162


#### Output del modelo:

df con los todos los datos, el comentario y la label que le predice el modelo

In [21]:
#Transformo el vector numerico a texto
text = []

for i in range(len(X_test)):

    # Retrieve the feature names using the workaround
    feature_names = tfidfconverter.inverse_transform(X_test)
    feature_names = feature_names[i] 

    # Join the feature names to reconstruct the text
    text_str = ' '.join(feature_names)

    text.append(text_str)

In [22]:
#Dataframe con numero de orden, commentario (procesado) y la label que predigo con su probabilidad
output_df = pd.DataFrame({'order_number':orders_test ,'comment': text, 'predicted_category': y_pred, 'probability': proba.max(axis=1)})

In [23]:
output_df.head(1)

Unnamed: 0,order_number,comment,predicted_category,probability
5281,21218335.0,buena carrito cosas lenta,problemas con la plataforma,0.78


In [24]:
#Creo una nueva columna que es igual a predicted_category si la probabilidad de esa label > 80% sino le pongo "manual review"
output_df['final_category'] = output_df.apply(lambda row: row['predicted_category'] if row['probability'] > 0.8 else 'manual review', axis=1)

In [25]:
#Con el numero de orden cruzo toda la info
df_final = pd.merge(output_df, df, how='left', left_on=['order_number'], right_on=['order_number'])

In [26]:
df_final['final_category'].value_counts()

final_category
manual review                  120
positivo                        59
sin stock                       36
sin comentario                  12
problemas con la plataforma      8
compra                           7
sugerencia                       2
entrega                          1
Negativo                         1
Name: count, dtype: int64

In [27]:
len(df_final)

246

In [28]:
df_final.head(2)

Unnamed: 0,order_number,comment,predicted_category,probability,final_category,rating,rating_created_at,comments,customer_id,fos_user_type,outlet_id,company_name,company_email,firstname,lastname,order_source,new_category
0,21218335.0,buena carrito cosas lenta,problemas con la plataforma,0.78,manual review,7,2023-03-15 13:31:59,Es buena pero es lenta la parte de agregar cos...,30033.0,0.0,30266.0,Almacen,alanherrera121094@gmail.com,Alan,Herrera,PWA,problemas con la plataforma
1,21260995.0,bueno cantidades carrito muchas producto ser v...,positivo,0.692955,manual review,7,2023-03-29 08:41:06,Ser√≠a bueno que haya alguna indicaci√≥n cuand...,78642.0,0.0,78936.0,TRADENEO,tradeneosa@gmail.com,Horacio,Camiletti,PWA,problemas con la plataforma
