# Single Aspect Modelling

In [5]:
'''
Import required packages and libraries for multi-aspect modelling
'''
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [6]:
PATH = "../cleaning/single_aspect.pkl"

data    = joblib.load(PATH)
matrix  = data["matrix"]
columns = data["columns"]

In [7]:
first_row_coo = matrix[0].tocoo()
for i, v in zip(first_row_coo.col, first_row_coo.data):
    print(f"Column: {columns[i]}, Value: {v}, Column Index: {i}")

Column: Score, Value: 1.0, Column Index: 2
Column: sized, Value: -1.0, Column Index: 47964


## Training Testing Split

In [8]:
# Target variable we are trying to predict
target_col = columns.index("Score")

x = matrix[:, [i for i in range(matrix.shape[1]) if i != target_col]]   # Remove target variable from sparse matrix
y = matrix[:,columns.index("Score")].toarray().ravel()                  # Target variable extracted from sparse matrix

In [9]:
# Check dimensions of model inputs and labels
print(matrix.shape)
print(x.shape)
print(type(y))
print(np.shape(y))

(87239, 59427)
(87239, 59426)
<class 'numpy.ndarray'>
(87239,)


In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    x,              # Sparse matrix
    y,              # Target column
    test_size=0.2,  # 20% test split
    random_state=42
)

## Training Testing Split

In [11]:
# Target variable we are trying to predict
target_col = columns.index("Score")

x = matrix[:, [i for i in range(matrix.shape[1]) if i != target_col]]   # Remove target variable from sparse matrix
y = matrix[:,columns.index("Score")].toarray().ravel()                  # Target variable extracted from sparse matrix

In [12]:
# Check dimensions of model inputs and labels
print(matrix.shape)
print(x.shape)
print(type(y))
print(np.shape(y))

(87239, 59427)
(87239, 59426)
<class 'numpy.ndarray'>
(87239,)


In [21]:
x_train, x_test, y_train, y_test = train_test_split(
    x,              # Sparse matrix
    y,              # Target column
    test_size=0.2,  # 20% test split
    random_state=42
)

## Modelling

In [22]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

ridge = Ridge(alpha=1.0)  # You can tune alpha later
ridge.fit(x_train, y_train)
y_pred = ridge.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")

aspect_names = columns  # or provide aspect column names directly
coefficients = ridge.coef_

# Get top contributing aspects
top_positive = sorted(zip(coefficients, aspect_names), reverse=True)[:10]
top_negative = sorted(zip(coefficients, aspect_names))[:10]

print("Top Positive Aspects:")
for coef, name in top_positive:
    print(f"{name}: {coef:.4f}")

print("\nTop Negative Aspects:")
for coef, name in top_negative:
    print(f"{name}: {coef:.4f}")

MSE: 1.7399
Top Positive Aspects:
time shipment: 2.5835
meat type: 2.5634
costcos: 2.5436
bacon milkshakes: 2.4638
food internet: 2.4582
licorice jelly bean: 2.4178
keurig cup: 2.3693
green mountain breafast: 2.3690
bark bite chips: 2.3688
advertisement: 2.3631

Top Negative Aspects:
sour mix egg: -2.4646
amino liquid: -2.2722
vegetable oil: -2.2168
chocolate rum cake: -2.2152
brown water chicken soup: -2.2150
sound quality: -2.2118
flyer: -2.2032
blueberry breakfast bar: -2.2028
cuitlacoche corn smut corn fungus: -2.1703
lemonade fruit punch: -2.0745


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")

importances = rf.feature_importances_
sorted_idx = importances.argsort()[::-1]

print("Top Contributing Aspects:")
for idx in sorted_idx[:10]:
    print(f"{aspect_names[idx]}: {importances[idx]:.4f}")

In [None]:
# Instantiate models for training
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
rf  = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2)

In [24]:
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.20      0.18      0.19      2138
         2.0       0.09      0.04      0.05      1021
         3.0       0.09      0.05      0.06      1353
         4.0       0.15      0.10      0.12      2219
         5.0       0.64      0.78      0.70     10717

    accuracy                           0.52     17448
   macro avg       0.23      0.23      0.22     17448
weighted avg       0.45      0.52      0.48     17448



In [25]:
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

         1.0       0.30      0.29      0.30      2138
         2.0       0.14      0.14      0.14      1021
         3.0       0.15      0.14      0.15      1353
         4.0       0.19      0.18      0.19      2219
         5.0       0.69      0.71      0.70     10717

    accuracy                           0.51     17448
   macro avg       0.30      0.29      0.29     17448
weighted avg       0.51      0.51      0.51     17448

