In [4]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the data
df = pd.read_csv('./data/reviews_with_sentiment.csv')

# Check and clean the data types
df['upvotes'] = pd.to_numeric(df['upvotes'], errors='coerce')  # Convert non-numeric to NaN
df['upvotes'] = df['upvotes'].fillna(0)  # Replace NaN with 0 or some other sensible default
df['year'] = pd.to_numeric(df['year'], errors='coerce')  # Just to be safe
df['sentiment'] = pd.to_numeric(df['sentiment'], errors='coerce')  # Just to be safe

# Continue with the clustering as before
features = df[['upvotes', 'year', 'sentiment']]

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Define the K-Means model
kmeans = KMeans(n_clusters=5, random_state=42)

# Fit the model to the scaled features
kmeans.fit(features_scaled)
# Assign the cluster labels to the original dataframe
df['cluster'] = kmeans.labels_

# Compute the mean on specific columns only
cluster_means = df.groupby('cluster')[['upvotes', 'year', 'sentiment']].mean()

# Output the cluster means
print(cluster_means)

# If you want to save the dataframe with the cluster labels to a new CSV file
# Make sure to replace 'path_to_save_clustered_data.csv' with your actual desired path
df.to_csv('./data/clustered_data.csv', index=False)


             upvotes    year  sentiment
cluster                                
0        1306.250000  2021.0   0.288291
1          47.476077  2021.0   0.244526
2          25.151961  2021.0   0.441366
3          46.980892  2021.0  -0.023880
4          19.963768  2021.0   0.692652


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assume df is your DataFrame with the 'cluster' labels and other features

# Select features for the classification - exclude text and other non-numeric columns
X = df[['upvotes', 'sentiment']]  # You can add more features here
y = df['cluster']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.99      1.00      0.99        87
           2       0.97      0.99      0.98        69
           3       1.00      1.00      1.00        38
           4       1.00      0.93      0.96        29

    accuracy                           0.99       230
   macro avg       0.99      0.98      0.99       230
weighted avg       0.99      0.99      0.99       230

