In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load and Clean Data
import io

from google.colab import files

# files.upload() opens Colab's upload widget and returns a dict that maps
# each uploaded file name to that file's raw bytes.
uploaded = files.upload()

if uploaded:
    # Parse the bytes that were just uploaded. Colab renames an upload whose
    # name already exists on disk (e.g. "amazon (1).csv"), so reading the fixed
    # path "amazon.csv" could silently load a stale copy from an earlier run.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy on disk.
    df = pd.read_csv("amazon.csv")

# `df.head` without parentheses only references the method. Call it, and print
# the result because this is not the last expression of the cell.
print(df.head())

def clean_currency(x):
    """Turn a rupee price string into a float scaled by the 0.011 factor.

    The rupee sign and thousands separators are removed from a string such as
    "₹1,099" before it is parsed. The sign is recognised both when it was
    decoded correctly and when it arrives as UTF-8 bytes mis-decoded as
    cp1252 or latin-1 text.

    Args:
        x: Raw cell value from a price column.

    Returns:
        ``0.011 * value`` for a parseable string, ``np.nan`` for a string that
        is not a number, and ``x`` unchanged for any non-string input.
    """
    if not isinstance(x, str):
        # Non-strings (NaN for empty cells, or floats left by an earlier run of
        # this cleaning step) pass through, so re-running does not rescale.
        return x

    rupee_signs = (
        "\u00e2\u201a\u00b9",  # UTF-8 bytes of the sign read as cp1252
        "\u00e2\u0082\u00b9",  # UTF-8 bytes of the sign read as latin-1
        "\u20b9",              # the correctly decoded rupee sign
    )
    cleaned = x.replace(',', '')
    for sign in rupee_signs:
        cleaned = cleaned.replace(sign, '')

    try:
        return 0.011 * float(cleaned)
    except ValueError:
        return np.nan

# Normalise the numeric columns: prices go through clean_currency, and any
# rating that is not a number becomes NaN so the dropna below removes its row.
for price_col in ('discounted_price', 'actual_price'):
    df[price_col] = df[price_col].apply(clean_currency)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Keep only rows where every field used by the models is present. The explicit
# .copy() makes df_clean an independent DataFrame, so the column assignments
# below no longer trigger pandas' SettingWithCopyWarning.
required_columns = [
    'discounted_price', 'actual_price', 'rating',
    'product_name', 'about_product', 'category',
]
df_clean = df.dropna(subset=required_columns).copy()

# Feature Engineering
# Combine Title and Description
df_clean['combined_text'] = df_clean['product_name'] + " " + df_clean['about_product']

# Extract "Main Category" (first category from long string, such as "Computers&Accessories")
# This reduces the number of unique categories to a manageable set (e.g., Electronics, Home&Kitchen)
df_clean['main_category'] = df_clean['category'].astype(str).str.split('|').str[0]

# Define inputs, outputs, and splits
# Features: the merged text column plus the top-level category.
X = df_clean[['combined_text', 'main_category']]

# Targets: one series per model.
y_rating = df_clean['rating']
y_actual_price = df_clean['actual_price']
y_discounted_price = df_clean['discounted_price']

# A single call splits the features and all three targets together, holding
# out 20% of the rows for testing with a fixed seed for repeatability.
(
    X_train, X_test,
    y_rating_train, y_rating_test,
    y_actual_train, y_actual_test,
    y_disc_train, y_disc_test,
) = train_test_split(
    X,
    y_rating,
    y_actual_price,
    y_discounted_price,
    test_size=0.2,
    random_state=42,
)

# Preprocessing Pipeline
from sklearn.base import clone

# Unfitted template for the feature transformer. Each model pipeline below
# receives its own clone, so fitting one pipeline never refits the transformer
# that another pipeline relies on.
preprocessor = ColumnTransformer(
    transformers=[
        # Apply TF-IDF to the text column
        ('text', TfidfVectorizer(stop_words='english', max_features=5000), 'combined_text'),
        # Apply One-Hot Encoding to the categorical column
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['main_category'])
    ]
)


# Create Models
def _build_rf_pipeline():
    """Return an unfitted pipeline: cloned preprocessor + random forest."""
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(n_estimators=50, random_state=42)),
    ])


rf_rating_model = _build_rf_pipeline()
rf_actual_price_model = _build_rf_pipeline()
rf_discount_price_model = _build_rf_pipeline()

# Train Models
print("Training models...")
rf_rating_model.fit(X_train, y_rating_train)
rf_actual_price_model.fit(X_train, y_actual_train)
rf_discount_price_model.fit(X_train, y_disc_train)

# Prediction Function accepting Category
def predict_product_v2(title, description, category):
    """Predict the rating and both prices for a single product.

    Args:
        title: Product title text.
        description: Product description text; joined to the title with a
            space, mirroring how 'combined_text' is built for training.
        category: Value for the 'main_category' feature.

    Returns:
        A dict with the keys "Predicted Rating" (rounded to 1 decimal),
        "Estimated Actual Price" and "Estimated Discounted Price" (rounded to
        2 decimals). Values are built-in floats, so the dict prints as plain
        numbers rather than ``np.float64(...)`` wrappers.
    """
    # Create a single-row DataFrame with the same structure as training data
    input_data = pd.DataFrame({
        'combined_text': [title + " " + description],
        'main_category': [category]
    })

    # label -> (fitted model, number of decimals kept in the output)
    outputs = {
        "Predicted Rating": (rf_rating_model, 1),
        "Estimated Actual Price": (rf_actual_price_model, 2),
        "Estimated Discounted Price": (rf_discount_price_model, 2),
    }
    return {
        label: round(float(model.predict(input_data)[0]), ndigits)
        for label, (model, ndigits) in outputs.items()
    }


# Model Evaluation
def print_model_metrics(model_name, y_test, y_pred, unit=""):
    """
    Print R-squared, MAE and RMSE for one regression model.

    Args:
        model_name: Label used in the report's heading line.
        y_test: True target values.
        y_pred: Predicted target values.
        unit: Prefix printed in front of the MAE and RMSE figures.
    """
    # Compute the three scores first, then emit the report line by line.
    abs_error = mean_absolute_error(y_test, y_pred)
    root_sq_error = np.sqrt(mean_squared_error(y_test, y_pred))
    variance_explained = r2_score(y_test, y_pred)

    report_lines = (
        f"{model_name} Performance",
        f"R-squared (Variance Explained): {variance_explained:.3f}",
        f"Mean Absolute Error (Average Miss): {unit}{abs_error:,.2f}",
        f"Root Mean Squared Error (Large Miss Penalty): {unit}{root_sq_error:,.2f}",
    )
    for line in report_lines:
        print(line)

# Predictions for the Test Set
y_rating_pred, y_actual_pred, y_disc_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (rf_rating_model, rf_actual_price_model, rf_discount_price_model)
)

# Print Reports
for report_title, truth, predicted, unit_prefix in (
    ("Product Rating Model", y_rating_test, y_rating_pred, ""),
    ("Actual Price Model", y_actual_test, y_actual_pred, "$"),
    ("Discount Price Model", y_disc_test, y_disc_pred, "$"),
):
    print_model_metrics(report_title, truth, predicted, unit=unit_prefix)

# Example Usage (User inputs: Title, Description, and Category)
new_title, new_desc, new_cat = (
    "4K Smart LED TV 55 Inch",
    "Ultra HD display with cinematic sound and multiple HDMI ports.",
    "Electronics",
)

result = predict_product_v2(new_title, new_desc, new_cat)
print("\nPrediction Result")
print(result)

Saving amazon.csv to amazon (1).csv
Training models...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['combined_text'] = df_clean['product_name'] + " " + df_clean['about_product']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['main_category'] = df_clean['category'].astype(str).apply(lambda x: x.split('|')[0])


Product Rating Model Performance
R-squared (Variance Explained): 0.219
Mean Absolute Error (Average Miss): 0.16
Root Mean Squared Error (Large Miss Penalty): 0.25
Actual Price Model Performance
R-squared (Variance Explained): 0.875
Mean Absolute Error (Average Miss): $15.83
Root Mean Squared Error (Large Miss Penalty): $37.16
Discount Price Model Performance
R-squared (Variance Explained): 0.882
Mean Absolute Error (Average Miss): $9.36
Root Mean Squared Error (Large Miss Penalty): $24.86

Prediction Result
{'Predicted Rating': np.float64(4.1), 'Estimated Actual Price': np.float64(36.03), 'Estimated Discounted Price': np.float64(35.21)}
