In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error


In [14]:
# Load the test split from disk and preview its first rows.
test_df = pd.read_csv("Data/test.csv")
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [5]:
def data_cleaning(df):
    """Return a cleaned copy of ``df`` with gaps imputed and labels normalised.

    Steps applied, in order:

    * missing ``Item_Weight`` values are filled with the column mean;
    * missing ``Outlet_Size`` values are filled with ``'Small'``;
    * ``Item_Visibility`` values equal to 0 are replaced with the column mean
      (the mean is taken over all rows, the zero rows included);
    * ``Item_Fat_Content`` aliases are unified: ``'LF'`` and ``'low fat'``
      become ``'Low Fat'``, ``'reg'`` becomes ``'Regular'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Item_Weight``, ``Outlet_Size``,
        ``Item_Visibility`` and ``Item_Fat_Content``.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is not modified, so re-running the
        calling cell always starts from the same raw data.
    """
    # Work on a copy: mutating the caller's frame makes every later frame an
    # alias of the raw one and breaks re-execution of downstream cells.
    cleaned = df.copy()

    # NOTE(review): all statistics below come from the frame passed in. If the
    # model was fitted with training-set statistics, confirm this is intended.
    mean_weight = cleaned['Item_Weight'].mean()
    cleaned['Item_Weight'] = cleaned['Item_Weight'].fillna(mean_weight)

    cleaned['Outlet_Size'] = cleaned['Outlet_Size'].fillna('Small')

    # Replace zero Item_Visibility values with the mean. The mean is computed
    # before the assignment, so it still includes the zero entries.
    mean_visibility = cleaned["Item_Visibility"].mean()
    cleaned.loc[cleaned["Item_Visibility"] == 0, "Item_Visibility"] = mean_visibility

    # Standardize Item_Fat_Content labels
    fat_content_aliases = {'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}
    cleaned['Item_Fat_Content'] = cleaned['Item_Fat_Content'].replace(fat_content_aliases)

    return cleaned
# Run the cleaning step on the loaded test frame and show the result.
clean_data = test_df.pipe(data_cleaning)
clean_data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750000,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300000,Regular,0.038428,Dairy,87.3198,OUT017,2007,Small,Tier 2,Supermarket Type1
2,NCN55,14.600000,Low Fat,0.099575,Others,241.7538,OUT010,1998,Small,Tier 3,Grocery Store
3,FDQ58,7.315000,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,Small,Tier 2,Supermarket Type1
4,FDY38,12.695633,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500000,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600000,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,Small,Tier 2,Supermarket Type1
5679,FDJ26,15.300000,Regular,0.065684,Canned,214.6218,OUT017,2007,Small,Tier 2,Supermarket Type1


In [6]:
def creating_new_features(df, reference_year=2025):
    """Return a copy of ``df`` extended with six engineered columns.

    Added columns (appended in this order):

    * ``Outlet_Age`` = ``reference_year - Outlet_Establishment_Year``
    * ``estimated_sales_per_year`` = ``Item_MRP * (1 - Item_Visibility) / (Outlet_Age + 1)``
    * ``Price_Per_Weight`` = ``Item_MRP / (Item_Weight + 1)``
    * ``MRP_per_Outlet_Age`` = ``Item_MRP / (Outlet_Age + 1)``
    * ``MRP_Visibility_Interaction`` = ``Item_MRP * Item_Visibility``
    * ``Weight_Visibility_Interaction`` = ``Item_Weight * Item_Visibility``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Outlet_Establishment_Year``, ``Item_MRP``,
        ``Item_Visibility`` and ``Item_Weight``.
    reference_year : int, default 2025
        Year against which the outlet age is measured. Keep it equal to the
        value used when the model's training features were built.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is not modified.
    """
    # Copy first so the caller's frame (and anything aliasing it) stays intact.
    featured = df.copy()

    # Create new feature Outlet_Age
    featured['Outlet_Age'] = reference_year - featured['Outlet_Establishment_Year']

    # The +1 keeps the denominator non-zero for an outlet opened in the
    # reference year (age 0). An outlet dated one year after the reference
    # year (age -1) would still divide by zero.
    age_plus_one = featured['Outlet_Age'] + 1

    featured['estimated_sales_per_year'] = (
        featured['Item_MRP'] * (1 - featured['Item_Visibility'])
    ) / age_plus_one
    # +1 to avoid division by zero for a zero weight
    featured['Price_Per_Weight'] = featured['Item_MRP'] / (featured['Item_Weight'] + 1)

    # Interaction Features
    featured["MRP_per_Outlet_Age"] = featured["Item_MRP"] / age_plus_one
    featured["MRP_Visibility_Interaction"] = featured["Item_MRP"] * featured["Item_Visibility"]
    featured["Weight_Visibility_Interaction"] = featured["Item_Weight"] * featured["Item_Visibility"]

    return featured

# Build the engineered feature columns from the cleaned frame
new_features_data = clean_data.pipe(creating_new_features)

# Preview the leading rows of the result
new_features_data.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age,estimated_sales_per_year,Price_Per_Weight,MRP_per_Outlet_Age,MRP_Visibility_Interaction,Weight_Visibility_Interaction
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,26,3.964676,4.959182,3.994896,0.81596,0.15697
1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,2007,Small,Tier 2,Supermarket Type1,18,4.419174,9.389226,4.595779,3.355497,0.31895
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,Small,Tier 3,Grocery Store,27,7.774328,15.497038,8.634064,24.072612,1.453794
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,Small,Tier 2,Supermarket Type1,18,8.03412,18.645099,8.159684,2.385724,0.112566
4,FDY38,12.695633,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,40,5.035378,17.102531,5.712927,27.779517,1.505693


In [8]:
# Remove the identifier column by rebinding to a new frame instead of dropping
# in place. new_features_data may be the very same object as frames produced by
# earlier cells, and an in-place drop would strip Item_Identifier from those
# too (it is needed again when the submission table is built).
# errors="ignore" keeps the cell re-runnable once the column is already gone.
new_features_data = new_features_data.drop(columns=["Item_Identifier"], errors="ignore")
new_features_data.shape


(5681, 16)

In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler,power_transform

import pandas as pd

def encoding_and_standardized_data(df, label_threshold=5):
    """Encode the object-dtype columns of ``df`` and return a new frame.

    Object columns are split by their number of distinct non-null values:

    * fewer than ``label_threshold`` distinct values: the column is replaced
      by integer codes from a ``LabelEncoder`` fitted on that column (missing
      values are first filled with ``"Unknown"``);
    * otherwise: the column is replaced by one-hot indicator columns from a
      ``OneHotEncoder``, appended after the remaining original columns.

    Despite the function name, no scaling or standardisation is applied;
    non-object columns pass through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features. It is not modified.
    label_threshold : int, default 5
        Distinct-value count below which a column is label-encoded rather
        than one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        A new frame with every object column encoded numerically.
    """
    # NOTE(review): both encoders are fitted on the frame passed in, so the
    # code assignment and the one-hot column set depend on the values present
    # in this data. Confirm they match what the model saw during training.
    categorical_features = df.select_dtypes(include='object').columns

    # Decide the encoding of each categorical column from its cardinality.
    cardinality = {col: df[col].nunique() for col in categorical_features}
    label_columns = [col for col, n_unique in cardinality.items() if n_unique < label_threshold]
    onehot_columns = [col for col, n_unique in cardinality.items() if n_unique >= label_threshold]

    encoded = df.copy()

    # Only build the one-hot encoder when there is something to encode; this
    # avoids fitting it on a zero-column selection.
    if onehot_columns:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        onehot_values = onehot.fit_transform(encoded[onehot_columns])
        onehot_frame = pd.DataFrame(
            onehot_values,
            columns=onehot.get_feature_names_out(onehot_columns),
            index=encoded.index,
        )
        # Swap the original categorical columns for their indicator columns.
        encoded = pd.concat([encoded.drop(columns=onehot_columns), onehot_frame], axis=1)

    # Each column gets a freshly fitted LabelEncoder.
    for col in label_columns:
        filled = encoded[col].fillna("Unknown")  # Handle missing values
        encoded[col] = LabelEncoder().fit_transform(filled)

    return encoded

In [10]:
# Encode the categorical columns of the engineered frame and preview the result.
test_df_encodided = new_features_data.pipe(encoding_and_standardized_data)
test_df_encodided.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age,estimated_sales_per_year,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,20.75,0,0.007565,107.8622,1999,1,0,1,26,3.964676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8.3,1,0.038428,87.3198,2007,2,1,1,18,4.419174,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.6,0,0.099575,241.7538,1998,2,2,0,27,7.774328,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.315,0,0.015388,155.034,2007,2,1,1,18,8.03412,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.695633,1,0.118599,234.23,1985,1,2,3,40,5.035378,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
import pickle

# Load the fitted model. The context manager closes the file handle, which the
# bare open(...) call left dangling.
# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open("model_rf.pkl", "rb") as model_file:
    best_rf = pickle.load(model_file)

# Predict on the encoded test features and show the predictions.
pred = best_rf.predict(test_df_encodided)
pred

array([1629.81823586, 1308.5910575 ,  515.21587489, ..., 1930.78108359,
       3964.16472073, 1404.02389   ])

In [15]:

# Take a snapshot of the loaded test frame; the identifier columns are read
# from this copy. test_df must still carry both identifier columns here.
test_df_with_ids = test_df.copy()

# Assemble the submission table: the two identifiers plus the model
# predictions held in `pred`, matched to the rows by position.
submission_df = pd.DataFrame(
    dict(
        Item_Identifier=test_df_with_ids['Item_Identifier'],
        Outlet_Identifier=test_df_with_ids['Outlet_Identifier'],
        Item_Outlet_Sales=pred,
    )
)

# Show the assembled table
submission_df

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1629.818236
1,FDW14,OUT017,1308.591057
2,NCN55,OUT010,515.215875
3,FDQ58,OUT017,2293.823861
4,FDY38,OUT027,6233.136221
...,...,...,...
5676,FDB58,OUT046,1998.631252
5677,FDD47,OUT018,2484.796723
5678,NCO17,OUT045,1930.781084
5679,FDJ26,OUT017,3964.164721
