MACHINE LEARNING (Supervised learning)

In [13]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


import warnings
warnings.filterwarnings("ignore")



In [14]:
df_data = pd.read_csv(filepath_or_buffer="amzn_customer_reviews_cleaned.csv", sep=",")

In [15]:
df_data.head(3)

Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date,Score,Compound,Sentiment,New rating,New rating.1,New date,Country
0,Assault Kittens,really good option portability,impressive form factor really good balance siz...,4.5 out of 5 stars,Yes,"Reviewed in the United States on June 18, 2021","{'neg': 0.014, 'neu': 0.736, 'pos': 0.249, 'co...",0.9941,positive,4.5,4.5,2021-06-18,United States
1,Kenneth Cramer,excellent portable gam,writ review anyo fence purchas since really co...,5.0 out of 5 stars,Yes,"Reviewed in the United States on July 7, 2021","{'neg': 0.016, 'neu': 0.88, 'pos': 0.104, 'com...",0.9921,positive,5.0,5.0,2021-07-07,United States
2,Assault Kittens,best inch world,sold macbook best decision lifeif us macbook y...,1.0 out of 5 stars,Yes,"Reviewed in the United States on June 18, 2021","{'neg': 0.0, 'neu': 0.854, 'pos': 0.146, 'comp...",0.8779,positive,1.0,1.0,2021-06-18,United States


In [16]:
df_data['Sentiment']

0       positive
1       positive
2       positive
3       positive
4       positive
          ...   
1124    negative
1125    positive
1126    positive
1127    positive
1128     neutral
Name: Sentiment, Length: 1129, dtype: object

In [17]:
cross_validation = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
fthree_scorer = make_scorer(fbeta_score, average="macro", beta=3)

In [18]:
# split train valid test split
X = df_data["Review"]
y = df_data["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((846,), (283,), (846,), (283,))

In [20]:
vectorizer = TfidfVectorizer(
    encoding="utf-8",
    lowercase=False,
    tokenizer=None,
    analyzer="word",
    stop_words=None,
    ngram_range=(1, 2),
    min_df=1,
    norm="l2",
    use_idf=True
)

X_train = vectorizer.fit_transform(X_train).todense()
X_test = vectorizer.transform(X_test).todense()

In [21]:
X_train.shape, X_test.shape

((846, 33602), (283, 33602))

In [22]:
print("-"*65)
print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)
print("-"*65)
print("Originally:")
print(df_data.Sentiment.value_counts(normalize=True))
print("\n")
print("Train:")
print(y_train.value_counts(normalize=True))
print("\n")
print("Test:")
print(y_test.value_counts(normalize=True))
print("-"*65)

-----------------------------------------------------------------
Train shape:  (846, 33602)
Test shape:  (283, 33602)
-----------------------------------------------------------------
Originally:
Sentiment
positive    0.800709
negative    0.137290
neutral     0.062002
Name: proportion, dtype: float64


Train:
Sentiment
positive    0.797872
negative    0.141844
neutral     0.060284
Name: proportion, dtype: float64


Test:
Sentiment
positive    0.809187
negative    0.123675
neutral     0.067138
Name: proportion, dtype: float64
-----------------------------------------------------------------


SUPPORT VECTOR MACHINE

In [23]:
pipeline = Pipeline(
    [
        (
            "svc_clf",
            SVC(
                degree=3,
                gamma="auto",
                coef0=0.0,
                shrinking=True,
                probability=False,
                tol=0.001,
                cache_size=200,
                class_weight="balanced",
                verbose=False,
                max_iter=-1,
                decision_function_shape="ovr",
                break_ties=False,
                random_state=42
            )
        )
    ]
)

param_grid = {
    "svc_clf__C": [0.001, 0.01, 0.1, 1.0, 10, 100],
    "svc_clf__kernel": ["linear", "rbf", "sigmoid"]
}


In [24]:
svc_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cross_validation,
    scoring=fthree_scorer,
    n_jobs=-1,
    verbose=True
)


y_train_array = np.asarray(y_train)
X_train_array = np.asarray(X_train)
svc_grid.fit(X_train_array, y_train_array)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [25]:
dict_params = svc_grid.best_params_
dict_params

{'svc_clf__C': 1.0, 'svc_clf__kernel': 'linear'}

In [26]:
svc_clf = SVC(
    C=dict_params["svc_clf__C"],
    kernel=dict_params["svc_clf__kernel"],
    degree=3,
    gamma="auto",
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight="balanced",
    verbose=False,
    max_iter=-1,
    decision_function_shape="ovr",
    break_ties=False,
    random_state=42
)
y_train_array = np.asarray(y_train)
X_train_array = np.asarray(X_train)
svc_clf.fit(X_train_array, np.ravel(y_train_array))
X_test_array=np.asarray(X_test)
y_pred= svc_clf.predict(X_test_array)

In [27]:
cm = pd.crosstab(y_test, y_pred, rownames=["Real class"], colnames=["Predicted class"])
cm

Predicted class,negative,neutral,positive
Real class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,10,2,23
neutral,3,5,11
positive,3,1,225


In [28]:
accuracy_svc = round(accuracy_score(y_true=y_test, y_pred=y_pred), 4)
recall_svc = round(recall_score(y_true=y_test, y_pred=y_pred, average="macro"), 4)
precision_svc = round(precision_score(y_true=y_test, y_pred=y_pred, average="macro"), 4)
f1_score_svc = round(f1_score(y_true=y_test, y_pred=y_pred, average="macro"), 4)
f3_score_svc = round(fbeta_score(y_true=y_test, y_pred=y_pred, average="macro", beta=3), 4)


In [29]:
print(accuracy_svc)

0.8481


In [30]:
models = pd.DataFrame(
    {
        "Model": [
            
            "SVM"

        ],
        "Accuracy": [
          

            accuracy_svc
   
        ],
        "Recall": [
          

            recall_svc
          ], 
        "Precision": [
            
           
            precision_svc
        
        ],
        "F1-score": [
            
          
            f1_score_svc
           
          
        ],
        "F3-score": [
            
           
            f3_score_svc
            
        ]
    }
)

models.sort_values(by="Accuracy", ascending=False, ignore_index=True)

Unnamed: 0,Model,Accuracy,Recall,Precision,F1-score,F3-score
0,SVM,0.8481,0.5105,0.7062,0.5616,0.5171


Using SMOTE to adress the classes imbalance 

In [31]:
from imblearn.over_sampling import SMOTE



In [32]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_array, y_train_array)

# Now use X_train_resampled and y_train_resampled for training
svc_grid.fit(X_train_resampled, y_train_resampled)


  File "C:\Users\asmae\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [33]:

svc_clf = SVC(
    C=dict_params["svc_clf__C"],
    kernel=dict_params["svc_clf__kernel"],
    
)

# Fitting model
svc_clf.fit(X_train_resampled, np.ravel(y_train_resampled))


In [34]:
accuracy_svc = accuracy_score(y_test, y_pred)
recall_svc = recall_score(y_test, y_pred, average='macro')
precision_svc = precision_score(y_test, y_pred, average='macro')
f1_score_svc = f1_score(y_test, y_pred, average='macro')
f3_score_svc = fbeta_score(y_test, y_pred, beta=3, average='macro')


In [35]:
models = pd.DataFrame({
    "Model": ["SVM"],
    "Accuracy": [accuracy_svc],
    "Recall": [recall_svc],
    "Precision": [precision_svc],
    "F1-score": [f1_score_svc],
    "F3-score": [f3_score_svc]
})



In [36]:
models_sorted = models.sort_values(by="Accuracy", ascending=False, ignore_index=True)
print(models_sorted)


  Model  Accuracy    Recall  Precision  F1-score  F3-score
0   SVM  0.848057  0.510468   0.706242  0.561553  0.517091


Adjusting classes weights 

In [37]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))


In [38]:
pipeline = Pipeline([
    ("svc_clf", SVC(class_weight=class_weights_dict, 
                   degree=3,
                   gamma="auto",
                   coef0=0.0,
                   shrinking=True,
                   probability=False,
                   tol=0.001,
                   cache_size=200,
                   verbose=False,
                   max_iter=-1,
                   decision_function_shape="ovr",
                   break_ties=False,
                   random_state=42))
])


In [39]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights_array = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Create a dictionary mapping each class label to its corresponding weight
class_labels = np.unique(y_train)
class_weights_dict = {class_labels[i]: class_weights_array[i] for i in range(len(class_labels))}


In [40]:
# Define the SVM model with class weights
svc = SVC(class_weight=class_weights_dict, random_state=42)

# Grid Search for parameter tuning
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10, 100],
    "kernel": ["linear", "rbf", "sigmoid"]
}
cross_validation = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
grid_search = GridSearchCV(svc, param_grid, cv=cross_validation, scoring='f1_macro')
grid_search.fit(X_train_array, y_train_array)

# Best parameters
best_params = grid_search.best_params_

# Train the model with the best parameters
svc_best = SVC(class_weight=class_weights_dict, C=best_params['C'], kernel=best_params['kernel'], random_state=42)
svc_best.fit(X_train_array, y_train_array)

In [41]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Predict on the test data
y_pred = svc_best.predict(X_test_array)

# calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

#  metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")



Accuracy: 0.8409893992932862
F1 Score: 0.5703260941754666
Precision: 0.6530298973672468
Recall: 0.5317617187072484


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights (optional if you want manual control over class weights)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_array), y=y_train_array)
class_weights_dict = {np.unique(y_train_array)[i]: class_weights[i] for i in range(len(class_weights))}

# Define the pipeline
pipeline = Pipeline([
    ("log_clf", LogisticRegression(class_weight=class_weights_dict, random_state=42, n_jobs=-1))
])

# Define the parameter grid
param_grid = {
    "log_clf__C": [0.001, 0.01, 0.1, 1.0, 10, 100],
    "log_clf__solver": ["saga", "liblinear"],
    "log_clf__penalty": ["l1", "l2"]
}

# grid search
log_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cross_validation,
    scoring=fthree_scorer,
    n_jobs=-1,
    verbose=True
)

# model fitting
log_grid.fit(X_train_array, y_train_array)


dict_params = log_grid.best_params_

log_clf = LogisticRegression(
    C=dict_params["log_clf__C"],
    solver=dict_params["log_clf__solver"],
    penalty=dict_params["log_clf__penalty"],
    class_weight=class_weights_dict,  # Or "balanced"
    random_state=42,
    n_jobs=-1
)

#  model fitting
log_clf.fit(X_train_array, y_train_array)
y_pred = log_clf.predict(X_test_array)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [43]:

accuracy_log = accuracy_score(y_test, y_pred)
recall_log = recall_score(y_test, y_pred, average='macro')
precision_log = precision_score(y_test, y_pred, average='macro')
f1_score_log = f1_score(y_test, y_pred, average='macro')
f3_score_log = fbeta_score(y_test, y_pred, beta=3, average='macro')

# metrics
print(f"Accuracy: {accuracy_log:.4f}")
print(f"Recall: {recall_log:.4f}")
print(f"Precision: {precision_log:.4f}")
print(f"F1 Score: {f1_score_log:.4f}")
print(f"F3 Score: {f3_score_log:.4f}")


Accuracy: 0.7915
Recall: 0.5678
Precision: 0.6002
F1 Score: 0.5682
F3 Score: 0.5656


In [44]:
from sklearn.metrics import confusion_matrix
import pandas as pd

conf_matrix = confusion_matrix(y_test, y_pred)

conf_matrix_df = pd.DataFrame(conf_matrix, index=['True Negative', 'True Neutral', 'True Positive'], columns=['Predicted Negative', 'Predicted Neutral', 'Predicted Positive'])

print("Confusion Matrix:")
print(conf_matrix_df)


Confusion Matrix:
               Predicted Negative  Predicted Neutral  Predicted Positive
True Negative                  18                  3                  14
True Neutral                    7                  6                   6
True Positive                  27                  2                 200


Further adjusting weights 


In [45]:

custom_class_weights = {
    'negative': 5,
    'neutral': 5,
    'positive': 0.5
}


In [46]:

pipeline = Pipeline([
    ("svc_clf", SVC(class_weight=custom_class_weights, random_state=42))
])

param_grid = {
    "svc_clf__C": [0.001, 0.01, 0.1, 1.0, 10, 100],
    "svc_clf__kernel": ["linear", "rbf", "sigmoid"]
}

cross_validation = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
grid_search = GridSearchCV(pipeline, param_grid, cv=cross_validation, scoring='f1_macro')
grid_search.fit(X_train_array, y_train_array)

best_params = grid_search.best_params_

C = best_params['svc_clf__C']
kernel = best_params['svc_clf__kernel']

svc_best = SVC(class_weight=custom_class_weights, C=C, kernel=kernel, random_state=42)
svc_best.fit(X_train_array, y_train_array)


In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, fbeta_score
import pandas as pd


y_pred = svc_best.predict(X_test_array)

conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=['True Negative', 'True Neutral', 'True Positive'], 
                              columns=['Predicted Negative', 'Predicted Neutral', 'Predicted Positive'])
print("Confusion Matrix:")
print(conf_matrix_df)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
f3 = fbeta_score(y_test, y_pred, beta=3, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"F3 Score: {f3:.4f}")


Confusion Matrix:
               Predicted Negative  Predicted Neutral  Predicted Positive
True Negative                  13                  3                  19
True Neutral                    4                  5                  10
True Positive                   7                  1                 221
Accuracy: 0.8445
Recall: 0.5332
Precision: 0.6604
F1 Score: 0.5735
F3 Score: 0.5392


In [67]:
from joblib import dump 
dump(svc_best, 'model_filename.joblib')


['model_filename.joblib']

In [68]:
dump(vectorizer, 'fitted_vectorizer.pkl')

['fitted_vectorizer.pkl']