In [17]:
import pandas as pd 

In [18]:
df = pd.read_csv('../DATASETS/preprocessed_text.csv')

In [19]:
# Fill NaN values with an empty string
df.fillna('', inplace=True)

In [20]:
df.head()

Unnamed: 0,content,score,content_cleaned
0,Plsssss stoppppp giving screen limit like when...,2,plss stopp giving screen limit like when you a...
1,Good,5,good
2,👍👍,5,thumbs_up
3,Good,3,good
4,"App is useful to certain phone brand ,,,,it is...",1,app is useful to certain phone brand it is not...


In [21]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [22]:
# Step 1: Vectorize the 'content_cleaned' using Bag-of-Words
vectorizer = CountVectorizer()  # Limit to top 10,000 features
X_bow = vectorizer.fit_transform(df['content_cleaned'])

In [23]:
# Step 2: Normalize the features using MinMaxScaler
scaler = MaxAbsScaler()
X_normalized = scaler.fit_transform(X_bow)

In [24]:
# Step 3: Prepare and normalize the target variable (score)
y = df['score'].values.reshape(-1, 1)
y_scaler = MinMaxScaler(feature_range=(1, 5))  # Scale between 1 and 5 (the range of the scores)
y_normalized = y_scaler.fit_transform(y)

In [25]:
# Step 4: Split data into training and test sets
X_train, X_test, y_train, y_test, review_train, review_test = train_test_split(
    X_normalized, y_normalized, df['content_cleaned'], test_size=0.2, random_state=42
)

In [26]:
# Step 5: Train a regression model (Ridge Regression in this case)
model = Ridge()
model.fit(X_train, y_train)

In [27]:
# Step 6: Make predictions on the test set
y_pred = model.predict(X_test)

In [28]:
# Step 7: Inverse transform the predictions and actual test values
y_pred_original = y_scaler.inverse_transform(y_pred)
y_test_original = y_scaler.inverse_transform(y_test)

In [29]:
# Step 8: Clip predictions to stay within the 1-5 range
y_pred_original_clipped = np.clip(y_pred_original, 1, 5)

In [30]:
# Step 9: Evaluate the model using mean squared error
mse = mean_squared_error(y_test_original, y_pred_original_clipped)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.3063148663941409


In [31]:
# Step 10: Print some prediction examples along with review content
print("\nExample predictions:")
for i in range(20, 40):
    print(f"Review {i+1}:")
    print(f"Content: {review_test.iloc[i]}")
    print(f"Predicted score = {y_pred_original_clipped[i][0]:.2f}, Actual score = {y_test_original[i][0]:.2f}\n")


Example predictions:
Review 21:
Content: i think it is a best streaming experience for watch video
Predicted score = 4.01, Actual score = 5.00

Review 22:
Content: they are getting way to expensive for what the app is their competitors have a lot more affordable options both my mom and i are going to unsubscribe from netflix if they keep increasing their prices their content is not that great and when we first got it 9 99 thinking about replacing it with disney
Predicted score = 2.29, Actual score = 1.00

Review 23:
Content: i am very mad at netflix because it is the resean we do not have mcsm it bought mcsm that is the only reason but mcsm was my favorite game to play so i am very mad at netflix for taking that away
Predicted score = 2.52, Actual score = 1.00

Review 24:
Content: app is fine and works as expected it would be nice to have a sleep timer in the app that can be set based on time or episodes when watching on tv there was a are you still there and would turn off if you did

In [34]:
# Saving the model
import joblib

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

# Save the model and scalers as well
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'maxabs_scaler.pkl')
joblib.dump(y_scaler, 'minmax_scaler.pkl')

print("Model and scalers saved successfully.")

Model and scalers saved successfully.
