# Multi-Aspect Modelling

In [1]:
'''
Import required packages and libraries for multi-aspect modelling
'''
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
PATH = "../cleaning/multi_aspect.pkl"

data    = joblib.load(PATH)
matrix  = data["matrix"]
columns = data["columns"]

In [3]:
first_row_coo = matrix[0].tocoo()
for i, v in zip(first_row_coo.col, first_row_coo.data):
    print(f"Column: {columns[i]}, Value: {v}, Column Index: {i}")

Column: ProductId, Value: 1.0, Column Index: 0
Column: Helpfulness, Value: 1.0, Column Index: 1
Column: Score, Value: 4.0, Column Index: 2
Column: lewis, Value: 1.0, Column Index: 30318
Column: sugar, Value: 1.0, Column Index: 51383


## Training Testing Split

In [4]:
# Target variable we are trying to predict
target_col = columns.index("Score")

x = matrix[:, [i for i in range(matrix.shape[1]) if i != target_col]]   # Remove target variable from sparse matrix
y = matrix[:,columns.index("Score")].toarray().ravel()                  # Target variable extracted from sparse matrix

In [5]:
# Check dimensions of model inputs and labels
print(matrix.shape)
print(x.shape)
print(type(y))
print(np.shape(y))

(116302, 59427)
(116302, 59426)
<class 'numpy.ndarray'>
(116302,)


In [6]:
x_train, x_test, y_train, y_test = train_test_split(
    x,              # Sparse matrix
    y,              # Target column
    test_size=0.2,  # 20% test split
    random_state=42
)

## Modelling

In [7]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

ridge = Ridge(alpha=1.0)  # You can tune alpha later
ridge.fit(x_train, y_train)
y_pred = ridge.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")

aspect_names = columns  # or provide aspect column names directly
coefficients = ridge.coef_

# Get top contributing aspects
top_positive = sorted(zip(coefficients, aspect_names), reverse=True)[:10]
top_negative = sorted(zip(coefficients, aspect_names))[:10]

print("Top Positive Aspects:")
for coef, name in top_positive:
    print(f"{name}: {coef:.4f}")

print("\nTop Negative Aspects:")
for coef, name in top_negative:
    print(f"{name}: {coef:.4f}")

MSE: 1.0407
Top Positive Aspects:
white spelt: 2.9248
sugar milk cream: 2.6948
lunch room: 2.3262
dentynes: 2.2490
envelope box: 2.2178
sea salt vinegar chips: 2.1954
word desription: 2.1648
color kit: 2.1632
pastry: 2.1452
island dressing: 2.1368

Top Negative Aspects:
seasoning line: -2.8558
tuscan blend saute olive oil: -2.6767
tea flavored: -2.6664
packing order: -2.5857
sound quality: -2.5439
plastic cases: -2.3358
maple flavoured syrup: -2.2801
pickle goodness: -2.1957
malt vinegar gilds: -2.1454
pickles olives peppers: -2.1418


In [None]:
aspect_map = dict(zip(columns, coefficients))

In [13]:
for key, value in aspect_map.items():
    print(key, value)

ProductId -9.14440011270837e-08
Helpfulness 0.12255571282925215
Score -1.617895504139797
100 arabica bean 0.0
100 buckwheat 0.3698362960872682
100 cacao chocolate bar 0.3645821005018464
100 colombian 0.24400062559590308
100 colombian medium roast ground coffee 11 ounce cans 0.24400062559590308
100 colombian medium roast ground coffee 11 ounce cans pack melitta 100 colombian medium roast ground coffee 11 ounce cans pack 0.0
100 cranberry juice 0.6175489129370482
100 vegetable juice 0.0
100 whole wheat fig bar 0.0
11 ounce canister 0.09845135293688298
12 18 bars -0.9658366162584822
12 bottles 20 supermarket -1.046198432997391
12 ounce bags 0.49838809208216905
12 ounce bags pack 0.0
12 oz bag 0.0
12 oz cup 0.0
12 packs raspberry tea -0.06798580657762868
12 volt power -0.7144860859393244
13 beans 0.2652362183707803
14oz container 0.0
16 cup sampler 0.21906979425679218
16 ounce bag 0.0
16 ounce bottles 0.0
16 oz mug 0.0
16oz 0.0
17 oz box costs 49 site pack 0.0
1lb box premium saltines 0.47

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")

importances = rf.feature_importances_
sorted_idx = importances.argsort()[::-1]

print("Top Contributing Aspects:")
for idx in sorted_idx[:10]:
    print(f"{aspect_names[idx]}: {importances[idx]:.4f}")

In [None]:
# Instantiate models for training
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
rf  = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2)

In [79]:
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.10      0.25      0.14      1586
         2.0       0.06      0.06      0.06      1042
         3.0       0.06      0.06      0.06      1425
         4.0       0.17      0.11      0.13      3090
         5.0       0.72      0.67      0.69     16118

    accuracy                           0.50     23261
   macro avg       0.22      0.23      0.22     23261
weighted avg       0.53      0.50      0.51     23261



In [69]:
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.50      0.41      0.45      1586
         2.0       0.45      0.23      0.30      1042
         3.0       0.39      0.17      0.24      1425
         4.0       0.45      0.17      0.24      3090
         5.0       0.77      0.94      0.84     16118

    accuracy                           0.72     23261
   macro avg       0.51      0.38      0.41     23261
weighted avg       0.67      0.72      0.68     23261

