# Multi-Aspect Modelling

In [61]:
'''
Import required packages and libraries for multi-aspect modelling
'''
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [62]:
PATH = "../cleaning/multi_aspect.pkl"

data    = joblib.load(PATH)
matrix  = data["matrix"]
columns = data["columns"]

In [63]:
first_row_coo = matrix[0].tocoo()
for i, v in zip(first_row_coo.col, first_row_coo.data):
    print(f"Column: {columns[i]}, Value: {v}, Column Index: {i}")

Column: ProductId, Value: 1.0, Column Index: 0
Column: Helpfulness, Value: 1.0, Column Index: 1
Column: Score, Value: 4.0, Column Index: 2
Column: citrus gelatin, Value: 2.0, Column Index: 11947
Column: lewis, Value: 3.0, Column Index: 30318
Column: sugar, Value: 3.0, Column Index: 51383


## Training Testing Split

In [64]:
# Target variable we are trying to predict
target_col = columns.index("Score")

x = matrix[:, [i for i in range(matrix.shape[1]) if i != target_col]]   # Remove target variable from sparse matrix
y = matrix[:,columns.index("Score")].toarray().ravel()                  # Target variable extracted from sparse matrix

In [65]:
# Check dimensions of model inputs and labels
print(matrix.shape)
print(x.shape)
print(type(y))
print(np.shape(y))

(116302, 59427)
(116302, 59426)
<class 'numpy.ndarray'>
(116302,)


In [66]:
x_train, x_test, y_train, y_test = train_test_split(
    x,              # Sparse matrix
    y,              # Target column
    test_size=0.2,  # 20% test split
    random_state=42
)

## Modelling

In [None]:
# Instantiate models for training
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
nn  = MLPClassifier()
svc = SVC(kernel="sigmoid")
rf  = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2)
dt  = DecisionTreeClassifier(criterion="entropy", max_depth=None, min_samples_split=2)

In [79]:
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.10      0.25      0.14      1586
         2.0       0.06      0.06      0.06      1042
         3.0       0.06      0.06      0.06      1425
         4.0       0.17      0.11      0.13      3090
         5.0       0.72      0.67      0.69     16118

    accuracy                           0.50     23261
   macro avg       0.22      0.23      0.22     23261
weighted avg       0.53      0.50      0.51     23261



In [71]:
nn.fit(x_train, y_train)
y_pred = nn.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      1586
         2.0       0.00      0.00      0.00      1042
         3.0       0.00      0.00      0.00      1425
         4.0       0.00      0.00      0.00      3090
         5.0       0.69      1.00      0.82     16118

    accuracy                           0.69     23261
   macro avg       0.14      0.20      0.16     23261
weighted avg       0.48      0.69      0.57     23261



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      1586
         2.0       0.00      0.00      0.00      1042
         3.0       0.00      0.00      0.00      1425
         4.0       0.00      0.00      0.00      3090
         5.0       0.69      1.00      0.82     16118

    accuracy                           0.69     23261
   macro avg       0.14      0.20      0.16     23261
weighted avg       0.48      0.69      0.57     23261



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [69]:
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.50      0.41      0.45      1586
         2.0       0.45      0.23      0.30      1042
         3.0       0.39      0.17      0.24      1425
         4.0       0.45      0.17      0.24      3090
         5.0       0.77      0.94      0.84     16118

    accuracy                           0.72     23261
   macro avg       0.51      0.38      0.41     23261
weighted avg       0.67      0.72      0.68     23261



In [74]:
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.38      0.35      0.36      1586
         2.0       0.30      0.19      0.23      1042
         3.0       0.25      0.16      0.19      1425
         4.0       0.27      0.15      0.19      3090
         5.0       0.76      0.88      0.82     16118

    accuracy                           0.67     23261
   macro avg       0.39      0.34      0.36     23261
weighted avg       0.62      0.67      0.64     23261



In [83]:
counts = {}
for i in y_pred:
    if i in counts.keys():
        counts[i] += 1
    else:
        counts[i] = 1

In [84]:
sorted(counts.items())

[(5.0, 23261)]