<a href="https://colab.research.google.com/github/Chiranjeevi141004/Chiranjeevi141004/blob/main/Project_BM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Problem Statement**
The dataset contains sales records for retail products. The goal is to predict `Item_Outlet_Sales` from product attributes (e.g., Item Type, Weight, MRP) and store attributes (e.g., Outlet Size, Location Type, Establishment Year).

**Column Details**
The training set has 12 columns. The test set has 11: the same columns without the target variable `Item_Outlet_Sales`.

- `Item_Identifier`: Unique ID for each product
- `Item_Weight`: Weight of the item
- `Item_Fat_Content`: Whether the product is "Low Fat" or "Regular"
- `Item_Visibility`: The percentage of visibility in the store
- `Item_Type`: Category of the item (e.g., Dairy, Snacks)
- `Item_MRP`: Maximum Retail Price
- `Outlet_Identifier`: Store ID
- `Outlet_Establishment_Year`: Year when the store was established
- `Outlet_Size`: Size of the store (e.g., Small, Medium, High)
- `Outlet_Location_Type`: Tier of the city where the store is located
- `Outlet_Type`: Type of the store (e.g., Grocery Store, Supermarket)
- `Item_Outlet_Sales` (only in training data): Sales for a particular product in a store (target variable)

In [None]:
# Step 1: Install and Import Required Libraries
!pip install pandas numpy matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Step 2: Load the Dataset
# Directory that holds both CSV files. "/content" is the location the
# notebook originally hard-coded; change this one constant to read the
# files from somewhere else.
DATA_DIR = "/content"

train_df = pd.read_csv(f"{DATA_DIR}/Train_Data.csv")
test_df = pd.read_csv(f"{DATA_DIR}/Test.csv")

In [None]:
# Step 3: Understand the Data
print("Train Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)
print(train_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() adds a stray "None" line.
train_df.info()
print(train_df.describe())

In [None]:
# Step 4: EDA
## Univariate Analysis
# Histogram of the target column: 50 bins with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df["Item_Outlet_Sales"], bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Item_Outlet_Sales")
plt.show()


In [None]:
## Bivariate Analysis
# Correlation matrix of the numeric columns only, drawn as an annotated heatmap.
numeric_cols = train_df.select_dtypes(include=["number"])
correlations = numeric_cols.corr()

fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Numeric Features Only)")
plt.show()

In [None]:
# Step 5: Check Missing Values
# Number of missing entries in each training column.
missing_per_column = train_df.isna().sum()
print(missing_per_column)


In [None]:
# Step 6: Handling Missing Values (if any)
# Fill values are learned from train_df only and then applied to BOTH frames.
# Previously only train_df was imputed, so test_df kept its NaNs.
numeric_features = train_df.select_dtypes(include=["number"]).columns
categorical_features = train_df.select_dtypes(include=["object"]).columns

numeric_fill = train_df[numeric_features].median()              # per-column median
categorical_fill = train_df[categorical_features].mode().iloc[0]  # per-column first mode

train_df[numeric_features] = train_df[numeric_features].fillna(numeric_fill)
train_df[categorical_features] = train_df[categorical_features].fillna(categorical_fill)

# test_df does not have every training column (it has no target), so only
# the columns it actually contains are filled.
for fill_values in (numeric_fill, categorical_fill):
    shared_cols = [c for c in fill_values.index if c in test_df.columns]
    if shared_cols:
        test_df[shared_cols] = test_df[shared_cols].fillna(fill_values[shared_cols])

In [None]:
# Step 7: Check Duplicates
# Report how many fully duplicated rows exist, then keep the first of each.
duplicate_count = train_df.duplicated().sum()
print("Duplicate Rows: ", duplicate_count)
train_df = train_df.drop_duplicates()

In [None]:
# Step 8: Outlier Treatment
num_cols = ["Item_Weight", "Item_Visibility", "Item_MRP", "Item_Outlet_Sales"]

# One boxplot per column in num_cols, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 6))
for ax, col in zip(axes.flat, num_cols):
    sns.boxplot(y=train_df[col], color="skyblue", ax=ax)
    ax.set_title(f"Boxplot of {col}")
fig.tight_layout()
plt.show()

In [None]:
# Removing Outliers using IQR
# Keep only rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
# Columns are processed one after another, so each column's quartiles are
# computed on the rows that survived the filters of the columns before it.
for col in num_cols:
    q1, q3 = train_df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    within_fences = train_df[col].between(q1 - 1.5 * spread, q3 + 1.5 * spread)
    train_df = train_df[within_fences]

In [None]:
# Re-check Outlier Boxplots
# Same 2x2 boxplot grid as before, drawn from the filtered train_df.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 6))
for col, ax in zip(num_cols, axes.ravel()):
    sns.boxplot(y=train_df[col], color="lightgreen", ax=ax)
    ax.set_title(f"Boxplot of {col} (After Outlier Removal)")
fig.tight_layout()
plt.show()


In [None]:
# Step 9: Feature Engineering
## Encoding Categorical Variables
cat_cols = ["Item_Fat_Content", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Item_Type"]

# For each categorical column, number its categories 0, 1, 2, ... in the
# order they first appear in train_df, then apply that numbering to both
# frames. encoding_maps keeps the per-column {category: code} dictionaries.
# NOTE: the cell is not re-runnable on its own -- a second run would look up
# the already-encoded integers in a freshly built mapping.
encoding_maps = {}
for col in cat_cols:
    categories = train_df[col].unique()
    mapping = dict(zip(categories, range(len(categories))))
    encoding_maps[col] = mapping
    train_df[col] = train_df[col].map(mapping)
    # Any test value that map() leaves as NaN (e.g. a label that never
    # occurs in train_df) is coded -1.
    test_df[col] = test_df[col].map(mapping).fillna(-1).astype(int)


In [None]:
# Step 10: Splitting Features & Target
# The target and the two identifier columns are left out of the feature matrix.
non_feature_cols = ["Item_Outlet_Sales", "Item_Identifier", "Outlet_Identifier"]
y = train_df["Item_Outlet_Sales"]
X = train_df.drop(columns=non_feature_cols)

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Step 11: Model Training
# Seed for every estimator with internal randomness (tree, forest, boosting).
# Without it the reported metrics change from run to run.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "SVM": SVR()
}
# NOTE(review): KNN and SVR are fit on the raw, unscaled feature columns;
# StandardScaler is imported at the top of the notebook but never used.
# Consider scaling the features for these two models -- TODO confirm intent.

# Fit each model on the training split and report hold-out metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Performance:")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2 Score:", r2_score(y_test, y_pred))
    print("----------------------------------")



Linear Regression Performance:
MAE: 811.8118513418711
MSE: 1104699.8896443062
R2 Score: 0.49301455889262913
----------------------------------
Decision Tree Performance:
MAE: 17.848194840295967
MSE: 37127.55564944973
R2 Score: 0.9829608653403274
----------------------------------
Random Forest Performance:
MAE: 32.18860982101038
MSE: 8131.4609152498915
R2 Score: 0.9962681880050764
----------------------------------
Gradient Boosting Performance:
MAE: 673.1794682000622
MSE: 848294.4989484692
R2 Score: 0.61068796623414
----------------------------------
KNN Performance:
MAE: 13.414085487968762
MSE: 7847.477378609878
R2 Score: 0.9963985179887583
----------------------------------
SVM Performance:
MAE: 1153.2931653489431
MSE: 2144841.4665681073
R2 Score: 0.01565718687313622
----------------------------------


In [None]:
# Step 12: Hyperparameter Tuning for Best Model (Random Forest as an Example)
# 3 x 3 grid searched with 5-fold cross-validation, scored by R2.
param_grid = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
# The forest is seeded so that cross-validation scores, and therefore the
# selected parameters, are the same on every run.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)

# Evaluate the best estimator on the hold-out split.
y_pred_best = best_model.predict(X_test)
print("Best Model Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred_best))
print("MSE:", mean_squared_error(y_test, y_pred_best))
print("R2 Score:", r2_score(y_test, y_pred_best))
