In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santiago.bruzza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santiago.bruzza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Dataset

In [4]:
df = pd.read_excel('./dataset/nps_comments.xlsx')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   order_number  1046 non-null   int64 
 1   comments      1046 non-null   object
 2   category      1046 non-null   object
 3   rating        1046 non-null   int64 
 4   order_source  1046 non-null   object
dtypes: int64(2), object(3)
memory usage: 41.0+ KB


## Split data into X and y
- X: order number and comments
- y: category

In [6]:
#Separo en X and y (en X me quedo con los comentarios y en y con la label que los clasifica). En el articulo X is a list and y is a numpy array
X, y = df[['order_number','comments']] , np.array(df['category'])

## Text Preprocessing (NLP)

- Remove all the special characters
- Remove all single characters
- Remove single characters from the start
- Substituting multiple spaces with single space
- Removing prefixed 'b'
- Converting to Lowercase
- **Lemmatization**: Lemmatization is done in order to avoid creating features that are semantically similar but syntactically different. For instance "cats" is converted into "cat"


In [7]:
documents = []

comments_list = list(X['comments'])

stemmer = WordNetLemmatizer()

for sen in range(0, len(comments_list)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(comments_list[sen]))
    
    # remove all single characters
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization.
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)


## Convert text into numbers
We are going to use the **Bag of Words Model**, where we convert each unique word that appears in each comment into a feature.

Parameters:

- **max_features**: We retain the top max_features most frequent unique words. Infrequent words don't contribute significantly.
- **min_df**: It is the minimum number of comments in which a word must appear for us to consider it a feature. In other words, the word must appear in at least min_df comments to be considered (very rare words are not useful).
- **max_df**: We keep words that appear in a maximum of max_df [%] of the comments. Overly common words don't provide meaningful information (very common words are not helpful).
- **stop_words**: This is a list of words (usually standard) that do not contribute valuable information.

The Bag of Words assigns a score to each word based on how frequently it appears in a comment, but it doesn't take into account the total frequency across all comments. This is why we use TFIDF, where TF stands for "Term Frequency" and IDF stands for "Inverse Document Frequency".

Term frequency = (Number of Occurrences of a word)/(Total words in the document)

IDF(word) = Log((Total number of documents)/(Number of documents containing the word))

A word that is very frequent in a specific comment but less common in the rest will have a higher TFIDF score.

In [None]:
#TFIDF parameters (example)
#max_features=3000, min_df=2, max_df=0.5, stop_words=stopwords.words('spanish')

In [8]:
tfidfconverter = TfidfVectorizer()
numerical_X = tfidfconverter.fit_transform(documents).toarray()

## Split data into Train and Test datasets

In [9]:
X_train, X_test, y_train, y_test, orders_train, orders_test = train_test_split(
    numerical_X, y, X['order_number'], test_size=0.2, random_state=42)

## Classification Model and Predictions

In [10]:
#Modelo de clasificación
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

In [11]:
#Predictions
y_pred = classifier.predict(X_test)
proba = classifier.predict_proba(X_test)

## Evaluate the model
- Classification Report
- Confusion Matrix
- Accuracy Score

In [15]:
#Metricas de evalución del modelo
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print()
print(classification_report(y_test,y_pred))
print()
print('accuracy: ',accuracy_score(y_test, y_pred))



[[ 3  0  0  1  0  0  0  0]
 [ 0  2  0  4  1  0  7  0]
 [ 0  0  3  0  6  0  2  0]
 [ 0  0  0 86  2  0  1  0]
 [ 1  0  2  5 32  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  2  1  0 40  0]
 [ 0  0  1  1  1  0  0  5]]

                             precision    recall  f1-score   support

                   Negativo       0.75      0.75      0.75         4
                     compra       1.00      0.14      0.25        14
                    entrega       0.50      0.27      0.35        11
                   positivo       0.86      0.97      0.91        89
problemas con la plataforma       0.74      0.80      0.77        40
                    reclamo       0.00      0.00      0.00         1
                  sin stock       0.80      0.93      0.86        43
                 sugerencia       1.00      0.62      0.77         8

                   accuracy                           0.81       210
                  macro avg       0.71      0.56      0.58       210
               weight

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Save the model

In [105]:
#Save the model
with open('nps_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

## Read the model
and save into a variable named model

In [None]:
#Read the model and save into a variable named model
with open('nps_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

## Hyperparameter tuning:

In [16]:
#Default model parameters:
classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### Random Search

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 4000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [18]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits




### Best model
- Best parameters
- Predictions
- Accuracy

In [19]:
#Best params
rf_random.best_params_

{'n_estimators': 1466,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

In [20]:
#Best model
best_random = rf_random.best_estimator_

#Fit and predict
best_random.fit(X_train, y_train) 
y_pred_best = best_random.predict(X_test)

#Accuracy
print(accuracy_score(y_test, y_pred_best))

0.7952380952380952


## Model's output

The model's output will be organized into a DataFrame containing the following columns: order number, comment, predicted category, and probability. Additionally, I will introduce a new column labeled "Final category" This column will contain "Manual Review" if the probability is less than 80%, and it will display the predicted category if the probability exceeds 80%.

In [22]:
#Transform numerical vector to text
text = []

for i in range(len(X_test)):

    # Retrieve the feature names using the workaround
    feature_names = tfidfconverter.inverse_transform(X_test)
    feature_names = feature_names[i] 

    # Join the feature names to reconstruct the text
    text_str = ' '.join(feature_names)

    text.append(text_str)

In [23]:
#Dataframe with order_number, procceded comment, predicted category and probability
output_df = pd.DataFrame({'order_number':orders_test ,'comment': text, 'predicted_category': y_pred, 'probability': proba.max(axis=1)})

In [25]:
#final_category column
output_df['final_category'] = output_df.apply(lambda row: row['predicted_category'] if row['probability'] > 0.8 else 'manual review', axis=1)

In [30]:
#Merge the original data with output_df
df_final = pd.merge(output_df.drop(['comment'], axis=1), df, how='left', left_on=['order_number'], right_on=['order_number'])

In [38]:
#change column name "category" for "true_category"
df_final.rename(columns={'category': 'true_category'}, inplace=True)

In [39]:
#Model's output
df_final.head(2)

Unnamed: 0,order_number,predicted_category,probability,final_category,comments,true_category,rating,order_source
0,774,sin stock,0.658,manual review,hay muchos productos que queria comprar y no l...,sin stock,10,WEB
1,281,positivo,0.984,positivo,fácil y rapido,positivo,10,APP
