In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

In [2]:
# Read the training CSV and the test CSV (the latter is kept as `validation_data`).
sales_data, validation_data = (
    pd.read_csv(csv_name)
    for csv_name in ("train_v9rqX0R.csv", "test_AbJTz2l.csv")
)

In [3]:
# Feature engineering on the training frame.

# Outlet age in years, measured against the year 2025.
sales_data['Outlet_Age'] = sales_data['Outlet_Establishment_Year'].rsub(2025)

# Rescale visibility by a factor of 100.
sales_data['Item_Visibility'] = sales_data['Item_Visibility'].mul(100)

# Collapse the alternative spellings of the fat-content labels onto the two canonical ones.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
sales_data['Item_Fat_Content'] = sales_data['Item_Fat_Content'].replace(fat_content_aliases)


In [4]:
# Names of every object/category-typed column in the training frame.
non_numeric_view = sales_data.select_dtypes(include=['object', 'category'])
categorical_cols = list(non_numeric_view.columns)
print("Categorical Columns:", categorical_cols)

Categorical Columns: ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [5]:

def impute_outlet_size(row):
    """Return the outlet size for one row, inferring it when it is missing.

    A size counts as missing when it is NA (per ``pd.isna``) or the literal
    string ``'Missing'``. Missing sizes are inferred from the first matching
    rule below, checked in this order:

    1. established in 1987                       -> 'High'
    2. outlet type 'Supermarket Type3'           -> 'Medium'
    3. outlet type 'Grocery Store'               -> 'Small'
    4. location 'Tier 2'                         -> 'Small'
    5. established in 1997 or 2004               -> 'Small'
    6. established in 1999 or 2009               -> 'Medium'
    7. established in 1985, 'Supermarket Type1'  -> 'Medium'

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Outlet_Size', 'Outlet_Establishment_Year',
        'Outlet_Type' and 'Outlet_Location_Type'.

    Returns
    -------
    The existing size when it is present, the inferred size when a rule
    matches, and otherwise the original (missing) value unchanged, so that
    unmatched rows are never silently turned into ``None``.
    """
    size = row['Outlet_Size']

    # Nothing to impute: keep whatever size is already recorded.
    if not (pd.isna(size) or size == 'Missing'):
        return size

    year = row['Outlet_Establishment_Year']
    otype = row['Outlet_Type']
    loc = row['Outlet_Location_Type']

    if year == 1987:
        return 'High'
    if otype == 'Supermarket Type3':
        return 'Medium'
    if otype == 'Grocery Store':
        return 'Small'
    if loc == 'Tier 2':
        return 'Small'
    if year in (1997, 2004):
        return 'Small'
    if year in (1999, 2009):
        return 'Medium'
    # A 1985 grocery store is already handled by the 'Grocery Store' rule above,
    # so only the Supermarket Type1 case needs a year-specific rule.
    if year == 1985 and otype == 'Supermarket Type1':
        return 'Medium'

    # No rule applies: hand back the original value rather than an implicit None.
    return size
    
# Fill in Outlet_Size row by row using the rule-based imputer.
sales_data['Outlet_Size'] = sales_data.apply(impute_outlet_size, axis=1)


def _fill_with_own_mean(weights):
    """Replace the missing entries of one group with that group's mean."""
    return weights.fillna(weights.mean())


# Fill missing Item_Weight with the mean weight of the same Item_Identifier first;
# whatever is still missing afterwards falls back to the mean of the same Item_Type.
for group_key in ('Item_Identifier', 'Item_Type'):
    sales_data['Item_Weight'] = (
        sales_data.groupby(group_key)['Item_Weight'].transform(_fill_with_own_mean)
    )

# The establishment year is no longer needed once the imputation above has used it.
sales_data = sales_data.drop(columns=["Outlet_Establishment_Year"])


In [6]:
# Number of Item_Weight values that are still missing.
sales_data['Item_Weight'].isna().sum()

0

In [7]:
# Number of Outlet_Size values that are still missing.
sales_data['Outlet_Size'].isna().sum()

0

In [8]:
# Recompute the object/category column names on the current training frame.
categorical_cols = [
    *sales_data.select_dtypes(include=['object', 'category']).columns
]
print("Categorical Columns:", categorical_cols)

Categorical Columns: ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [9]:
# Split the training frame into the target column and everything else.
target_name = 'Item_Outlet_Sales'
y_train = sales_data[target_name]
X_train = sales_data.drop(columns=[target_name])

In [10]:
# Regressor settings, gathered in one place so they are easy to read and tweak.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.01,
    'depth': 6,
    'loss_function': 'RMSE',
    'cat_features': list(categorical_cols),
    'nan_mode': 'Min',
}
cat_model = CatBoostRegressor(**catboost_params)

# Fit on the full training frame without per-iteration logging.
# This stays the last expression so the cell still displays the fitted model.
cat_model.fit(X_train, y_train.values.ravel(), logging_level='Silent')



<catboost.core.CatBoostRegressor at 0x1b277bf7310>

In [11]:
# Apply the training-set preprocessing to the validation data (loaded from test_AbJTz2l.csv)

In [12]:
# Keep the two identifier columns aside before the frame is transformed.
id_column = validation_data[['Item_Identifier', 'Outlet_Identifier']]

# Same derived/rescaled columns as for the training frame.
validation_data['Outlet_Age'] = validation_data['Outlet_Establishment_Year'].rsub(2025)
validation_data['Item_Visibility'] = validation_data['Item_Visibility'].mul(100)

# Collapse the alternative spellings of the fat-content labels.
fat_label_fixes = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
validation_data['Item_Fat_Content'] = validation_data['Item_Fat_Content'].replace(fat_label_fixes)

# Rule-based Outlet_Size imputation, one row at a time.
validation_data['Outlet_Size'] = validation_data.apply(impute_outlet_size, axis=1)

# Missing Item_Weight: mean of the same Item_Identifier first, then mean of the same Item_Type.
for grouping in ('Item_Identifier', 'Item_Type'):
    validation_data['Item_Weight'] = (
        validation_data
        .groupby(grouping)['Item_Weight']
        .transform(lambda grp: grp.fillna(grp.mean()))
    )

# Drop the establishment year now that the imputation above has used it.
validation_data = validation_data.drop(columns=["Outlet_Establishment_Year"])



In [13]:
# Display the fitted model's feature importances as a table (prettified=True).
cat_model.get_feature_importance(prettified=True)


Unnamed: 0,Feature Id,Importances
0,Item_MRP,50.967631
1,Outlet_Identifier,21.753321
2,Outlet_Type,17.044573
3,Outlet_Location_Type,2.051171
4,Outlet_Size,1.990674
5,Outlet_Age,1.653026
6,Item_Type,1.602072
7,Item_Identifier,1.316079
8,Item_Weight,0.890299
9,Item_Visibility,0.487684


## Feature selection: keeping the four most important features (Item_MRP, Outlet_Identifier, Outlet_Type, Outlet_Location_Type) for model training