In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Load, clean and encode -------------------------------------------------
# Tunable inputs are named once here instead of being repeated as literals.
DATA_PATH = "/kaggle/input/dataset-csv/Dataset .csv"
TARGET_COL = 'Aggregate rating'

# Read the CSV and drop every row that has at least one missing value.
# A new frame is produced (no in-place mutation) and the index is reset so the
# row labels are contiguous again after rows have been removed.
df = pd.read_csv(DATA_PATH).dropna().reset_index(drop=True)

# Encode categorical (object-dtype) columns as integer codes.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`
# so each mapping can be inspected or inverted later via
# `label_encoders[col].inverse_transform(...)`. `le` is left bound to the
# encoder of the last processed column.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target.
# NOTE(review): every remaining column is used as a feature. If the dataset
# contains columns derived from the rating itself, or identifier-like columns,
# they would leak the target / add noise — TODO confirm against the schema.
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# Split the dataset into training and testing sets BEFORE fitting the scaler,
# so that the scaler's min/max statistics come from the training rows only and
# no information from the held-out rows reaches the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply MinMax scaling to features: fit on the training split, then apply the
# same transformation to the test split. The original row index and column
# names are kept so the scaled frames stay aligned with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Full feature matrix under the train-fitted scaler, kept under its original
# name for any later code that refers to `X_scaled`.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Build the Random Forest regressor (100 trees, fixed seed) and fit it on the
# training split; `fit` returns the estimator itself, so the fitted model is
# bound directly.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Generate predictions for the held-out rows.
y_pred = model.predict(X_test)

# Report error and goodness-of-fit on the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

# Attach the true and predicted ratings to a copy of the test features
# (`assign` returns a new frame, leaving X_test untouched), then preview the
# first ten rows of the two rating columns side by side.
results_df = X_test.assign(**{
    'Actual Rating': y_test.to_numpy(),
    'Predicted Rating': y_pred,
})
rating_cols = ['Actual Rating', 'Predicted Rating']
print(results_df.loc[:, rating_cols].head(10))


Mean Squared Error: 0.027872523834468314
R2 Score: 0.9878287633594838
      Actual Rating  Predicted Rating
7133            0.0             0.000
1851            4.1             4.136
4416            3.3             3.061
3210            3.9             3.598
8299            0.0             0.000
5254            2.9             2.863
6086            0.0             0.000
2287            4.3             4.178
2771            3.5             3.650
8760            0.0             0.000
