In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [44]:

df = pd.read_csv("amazon.csv")


def _strip_to_float(column, symbols):
    """Remove each literal symbol from a string column and cast the result to float.

    Matching is literal (regex=False), so the outcome does not depend on the
    pandas version's default for ``regex``. Missing values stay NaN.
    """
    for symbol in symbols:
        column = column.str.replace(symbol, '', regex=False)
    return column.astype(float)


def _fill_mean(column):
    """Return the column with NaN replaced by the mean of its non-missing values."""
    return column.fillna(column.mean())


# Numeric columns are parsed from text: strip the '%' sign from the discount and
# the '₹' sign and thousands commas from the prices, then mean-impute gaps.
# The result is assigned back to the frame rather than using
# `df[col].fillna(..., inplace=True)`, which acts on a temporary selection and
# does not reliably update `df` (it is a no-op under pandas Copy-on-Write).
df['discount_percentage'] = _fill_mean(_strip_to_float(df['discount_percentage'], ('%',)))
df['actual_price'] = _fill_mean(_strip_to_float(df['actual_price'], ('₹', ',')))
df['discounted_price'] = _fill_mean(_strip_to_float(df['discounted_price'], ('₹', ',')))

# Integer-encode 'category'. Casting to str first gives the encoder a uniform
# string column to fit on.
df['category'] = df['category'].astype(str)
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

# TF-IDF features for 'about_product'; missing descriptions become empty text.
# NOTE(review): the vectorizer (like the encoder above) is fitted on every row,
# i.e. before the train/test split performed in a later cell.
df['about_product'] = df['about_product'].fillna('')
tfidf = TfidfVectorizer(max_features=100)  # Adjust max_features as needed
tfidf_result = tfidf.fit_transform(df['about_product']).toarray()
# Share df's index so the column-wise concat lines rows up one-to-one even if
# df does not carry a default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)
df = pd.concat([df, tfidf_df], axis=1)


In [57]:
# Model inputs: the four numeric/encoded columns plus every TF-IDF term column.
features = ['category', 'discount_percentage', 'actual_price', 'discounted_price'] + list(tfidf.get_feature_names_out())
X = df[features]

# Clean the target column. Each symbol is removed as a literal string
# (regex=False) so that '|' is never interpreted as a regex alternation and the
# result does not depend on the pandas version's default for `regex`.
# Casting to str first keeps this cell re-runnable after 'rating' has already
# been converted to float; missing values render as 'nan', which parses back
# to NaN in the float cast below.
rating_text = df['rating'].astype(str)
for symbol in ('|', '%', '₹', ',', ' '):
    rating_text = rating_text.str.replace(symbol, '', regex=False)

# Entries that were nothing but stripped symbols are now empty: treat as missing.
rating_values = rating_text.replace('', np.nan).astype(float)

# Fill missing ratings with the mean of the parsed ratings.
# NOTE(review): this imputes the target using all rows, before the train/test
# split in a later cell; dropping rows with a missing rating is the alternative.
df['rating'] = rating_values.fillna(rating_values.mean())

# Target vector for the Random Forest model.
y = df['rating']


  df['rating'] = df['rating'].str.replace('|', '').str.replace('%', '').str.replace('₹', '').str.replace(',', '').str.replace(' ', '')


In [58]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,category,discount_percentage,actual_price,discounted_price,10,12,all,also,an,and,...,warranty,watch,water,when,which,will,with,year,you,your
1451,160,32.0,2199.0,1499.00,0.0,0.0,0.0,0.000000,0.000000,0.199809,...,0.364811,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
1340,136,51.0,1999.0,979.00,0.0,0.0,0.0,0.000000,0.000000,0.045937,...,0.000000,0.0,0.000000,0.241666,0.0,0.0,0.101984,0.0,0.152058,0.068294
254,94,76.0,399.0,96.00,0.0,0.0,0.0,0.000000,0.000000,0.145042,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.161001,0.0,0.000000,0.000000
1167,161,32.0,3945.0,2698.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.107661,0.0,0.000000,0.000000
1239,168,20.0,1999.0,1599.00,0.0,0.0,0.0,0.168739,0.000000,0.321843,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.285804,0.0,0.000000,0.287087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,169,48.0,6295.0,3249.00,0.0,0.0,0.0,0.000000,0.120364,0.102757,...,0.187613,0.0,0.000000,0.000000,0.0,0.0,0.114063,0.0,0.085034,0.000000
1294,144,22.0,1282.0,998.06,0.0,0.0,0.0,0.000000,0.000000,0.329245,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
860,76,62.0,1299.0,499.00,0.0,0.0,0.0,0.000000,0.268676,0.229374,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.381918,0.0,0.000000,0.000000
1459,183,80.0,999.0,199.00,0.0,0.0,0.0,0.088248,0.000000,0.504957,...,0.000000,0.0,0.181812,0.000000,0.0,0.0,0.112104,0.0,0.055715,0.050047


In [59]:
# Report every column of df in which at least one value, rendered as text,
# contains a literal '|' character.
pipe_columns = [
    name
    for name in df.columns
    if df[name].astype(str).str.contains('|', regex=False).any()
]
for name in pipe_columns:
    print(f"The column '{name}' contains the '|' symbol.")


The column 'product_name' contains the '|' symbol.
The column 'about_product' contains the '|' symbol.
The column 'user_name' contains the '|' symbol.
The column 'review_title' contains the '|' symbol.
The column 'review_content' contains the '|' symbol.


In [60]:
# Random forest of 100 trees with a fixed seed; both settings can be tuned.
rfr = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rfr.fit(X_train, y_train)

In [61]:
# Predict on the held-out rows and report the mean squared error.
y_pred = rfr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.06334774059356754
