In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show plots in the notebook
%matplotlib inline

# Scikit-learn Modules
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler

# XGBoost
from xgboost import XGBRegressor

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the dataset CSV into the working frame used by every later cell.
RAW_CSV_PATH = "/kaggle/input/amazon-sales-dataset/amazon.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [None]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:

# Text-to-number clean-up. For each listed column: cast to str, strip the
# given literal characters, then parse with pd.to_numeric. Anything that still
# fails to parse becomes NaN because of errors='coerce'.
chars_to_strip = {
    'discounted_price': ['₹', ','],
    'actual_price': ['₹', ','],
    'discount_percentage': ['%'],
    'rating_count': [','],
}
for column, unwanted in chars_to_strip.items():
    as_text = df[column].astype(str)
    for char in unwanted:
        as_text = as_text.str.replace(char, '')
    df[column] = pd.to_numeric(as_text, errors='coerce')

# rating has no characters to strip; it only needs safe numeric parsing.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')


In [None]:

# Fill missing numeric values with each column's median.
# The result is assigned back rather than calling fillna(inplace=True) on
# df[col]: that chained in-place form is deprecated in pandas 2.x and, with
# Copy-on-Write enabled, operates on a temporary and leaves df unchanged.
df['rating'] = df['rating'].fillna(df['rating'].median())
df['rating_count'] = df['rating_count'].fillna(df['rating_count'].median())

# Fail early (instead of inside a model fit later on) if imputation did not
# take effect, e.g. because a column was entirely NaN and has no median.
assert df[['rating', 'rating_count']].notna().all().all(), \
    "rating / rating_count still contain NaN after median imputation"

#  Verify changes
df.info()
df.head()

In [None]:
# Count fully duplicated rows, then show summary statistics for numeric columns.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")
df.describe()

In [None]:

# 2x2 grid of histograms (with KDE overlay), one panel per numeric column.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (column, panel title) pairs in row-major panel order.
distribution_panels = [
    ('discounted_price', "Distribution of Discounted Price"),
    ('actual_price', "Distribution of Actual Price"),
    ('discount_percentage', "Distribution of Discount Percentage"),
    ('rating', "Distribution of Ratings"),
]
for panel_ax, (panel_col, panel_title) in zip(axes.flat, distribution_panels):
    sns.histplot(df[panel_col], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Horizontal count plot of rating values, most frequent rating first.
rating_order = df['rating'].value_counts().index
rating_fig, rating_ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['rating'], order=rating_order, ax=rating_ax)
rating_ax.set_title("Count of Products by Rating")
plt.show()

In [None]:

# Scatter of list price against rating; alpha softens overplotted points.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="actual_price", y="rating", alpha=0.6, ax=scatter_ax)
scatter_ax.set_title("Actual Price vs Rating")
scatter_ax.set_xlabel("Actual Price (₹)")
scatter_ax.set_ylabel("Rating")
plt.show()

In [None]:

# Pairwise correlation between the numeric columns, annotated to 2 decimals.
numeric_cols = [
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
]
corr = df[numeric_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:

# Share of products at each rating value, expressed as a percentage.
rating_counts = df['rating'].value_counts(normalize=True).mul(100)

share_fig, share_ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=rating_counts.index, y=rating_counts.values, ax=share_ax)
share_ax.set_ylabel("Percentage (%)")
share_ax.set_xlabel("Rating")
share_ax.set_title("Share of Products by Rating")
plt.show()


In [None]:

# Ten most frequent values of the category column, drawn as horizontal bars.
category_counts = df['category'].value_counts()
top_categories = category_counts.nlargest(10)

category_fig, category_ax = plt.subplots(figsize=(16, 12))
sns.barplot(x=top_categories.values, y=top_categories.index, ax=category_ax)
category_ax.set_title("Top 10 Product Categories by Count")
category_ax.set_xlabel("Number of Products")
category_ax.set_ylabel("Category")
plt.show()

In [None]:

# 2. Encoding Categorical Variables
# Every object-dtype column is replaced in place by integer codes from its own
# LabelEncoder; the fitted encoders are kept so codes can be mapped back.
# NOTE(review): this encodes *all* object columns, which may include
# identifiers or free text - confirm those are meant to be model features.
categorical_cols = df.select_dtypes(include=['object']).columns

label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # Cast to str first so missing values are encoded as their own label.
    df[col] = le.fit_transform(df[col].astype(str))

In [None]:

# Keep only fully populated rows. The numeric parsing earlier in the notebook
# uses errors='coerce', so any price or discount that failed to parse is NaN,
# and only rating / rating_count are imputed. Estimators such as
# LinearRegression reject NaN in features or target, so drop such rows here.
model_df = df.dropna()
dropped_rows = len(df) - len(model_df)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing values before modelling")

# NOTE(review): actual_price and discount_percentage presumably determine
# discounted_price exactly - confirm that keeping both as features is intended,
# since it would make this regression close to trivial (target leakage).
X = model_df.drop(columns=["discounted_price"])   # Features: everything except the target
y = model_df["discounted_price"]                  # Target

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Function to evaluate the models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is trained when
    this function returns.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target used for scoring.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"R2"`` computed from the
        test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    metrics = {
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        "MAE": mean_absolute_error(y_test, predictions),
        "R2": r2_score(y_test, predictions),
    }
    return metrics


In [None]:


# Candidate regressors, keyed by the label shown in the results table.
# One shared seed keeps the stochastic estimators reproducible.
RANDOM_STATE = 42
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "XGBoost Regressor": XGBRegressor(
        random_state=RANDOM_STATE, n_estimators=200, learning_rate=0.1
    ),
}

# Fit and score every candidate on the same train/test split.
results = {
    label: evaluate_model(estimator, X_train, y_train, X_test, y_test)
    for label, estimator in models.items()
}

# One row per model, one column per metric, best (lowest) RMSE first.
results_df = pd.DataFrame(results).T.sort_values(by="RMSE", ascending=True)
display(results_df)

In [None]:
# Plot RMSE and MAE
# Melt to long form so seaborn can draw the two error metrics side by side
# for each model ('index' holds the model name after reset_index()).
error_long = results_df[['RMSE', 'MAE']].reset_index().melt(id_vars='index')

plt.figure(figsize=(10,5))
sns.barplot(data=error_long,
            x='index', y='value', hue='variable', palette='coolwarm')
plt.title("Model Comparison (RMSE & MAE)", fontsize=15, fontweight='bold')
plt.xlabel("Models")
plt.ylabel("Error Value")
plt.xticks(rotation=15)
plt.legend(title="Metrics")
plt.show()

# Plot R² separately
plt.figure(figsize=(10,5))
# seaborn 0.13 deprecates passing `palette` without `hue`, so the x variable is
# also assigned to hue; dodge=False keeps one full-width bar per model.
r2_ax = sns.barplot(data=results_df[['R2']].reset_index(),
                    x='index', y='R2', hue='index', palette='crest', dodge=False)
# Some seaborn versions add a legend for the hue variable; it only repeats the
# x tick labels here, so remove it when present.
r2_legend = r2_ax.get_legend()
if r2_legend is not None:
    r2_legend.remove()
plt.title("Model Comparison (R² Score)", fontsize=15, fontweight='bold')
plt.xlabel("Models")
plt.ylabel("R² Score")
plt.xticks(rotation=15)
plt.show()

In [None]:
# Identify best model based on lowest RMSE
best_model_name = results_df["RMSE"].idxmin()
print(f" Best Model: {best_model_name}")

# Retrieve the best model object. It is already fitted: evaluate_model() calls
# model.fit() on the very instances stored in `models`, so fitting again here
# would only repeat the training cost.
best_model = models[best_model_name]
y_pred = best_model.predict(X_test)

# Evaluate: reuse the test-set metrics already recorded for this model.
best_scores = results_df.loc[best_model_name]
rmse = best_scores["RMSE"]
mae = best_scores["MAE"]
r2 = best_scores["R2"]

print("\n Model Performance on Test Data")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")

# Plot: Actual vs Predicted
# The dashed red diagonal marks perfect predictions (predicted == actual).
actual_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, color="#00b894")
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.title(f"{best_model_name} - Actual vs Predicted", fontsize=15, fontweight='bold')
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()

# Residual Plot: histogram of (actual - predicted) on the test set
residuals = y_test - y_pred
plt.figure(figsize=(8,5))
sns.histplot(residuals, bins=30, kde=True, color="#0984e3")
plt.title(f"{best_model_name} - Residual Distribution", fontsize=15, fontweight='bold')
plt.xlabel("Prediction Error (Residuals)")
plt.ylabel("Frequency")
plt.show()
