In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random, math


from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score, mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor

### Data loading

In [None]:
df_path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/train_v9rqX0R.csv'
unseen_path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/test_AbJTz2l.csv'

# Configure pandas display options before anything is printed, so the preview
# below shows every column instead of a truncated view.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# `df` is the training data; `unseen_df` is the data predictions are generated for.
df = pd.read_csv(df_path)
unseen_df = pd.read_csv(unseen_path)
print(df.head())

In [None]:
df.info()

### Missing Values

In [None]:
df.isna().sum()

In [None]:
unseen_df.isna().sum()

### Create Item weights DF for each item

In [None]:
# Per-item weight lookup: the most frequent non-missing weight recorded for each item.
df_weights = (
    df[['Item_Identifier', 'Item_Weight']]
    .dropna(axis=0)
    .groupby(by='Item_Identifier')['Item_Weight']
    .apply(lambda weights: weights.mode().iloc[0])
    .reset_index()
)

# Per-item mean of the target, used later as a target encoding of Item_Identifier.
df_Item_Identifier = df.groupby(by='Item_Identifier', as_index=False)['Item_Outlet_Sales'].mean()
# df_weights.head(), df_Item_Identifier.head()

### Data Preprocessing 

In [None]:
# Outlet sizes used to impute missing Outlet_Size values: the first non-missing
# size observed among grocery stores and among Tier 2 outlets respectively.
grocery_rows = df['Outlet_Type'] == 'Grocery Store'
tier_2_rows = df['Outlet_Location_Type'] == 'Tier 2'

GROCERY_STORE_SIZE = df.loc[grocery_rows, 'Outlet_Size'].dropna().unique()[0]
LOCATION_TIER_2_SIZE = df.loc[tier_2_rows, 'Outlet_Size'].dropna().unique()[0]


# Every spelling of the fat-content label handled by the pipeline, mapped to a
# binary flag (1 = low fat, 0 = regular). Built once instead of once per row.
_FAT_CONTENT_FLAGS = {
    'Low Fat': 1,
    'Regular': 0,
    'LF': 1,
    'reg': 0,
    'low fat': 1,
}


def replace_missing_values(row):
    """Impute missing fields and add engineered features to a single record.

    Intended to be used with ``DataFrame.apply(..., axis=1)``. Reads the
    module-level lookups ``df_weights`` and ``df_Item_Identifier`` and the
    constants ``GROCERY_STORE_SIZE`` and ``LOCATION_TIER_2_SIZE``.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least ``Item_Identifier``, ``Item_Weight``,
        ``Item_Fat_Content``, ``Item_Visibility``, ``Outlet_Identifier``,
        ``Outlet_Size``, ``Outlet_Type``, ``Outlet_Location_Type`` and
        ``Outlet_Establishment_Year``.

    Returns
    -------
    pandas.Series
        The same record with ``Item_Weight`` / ``Outlet_Size`` imputed where a
        rule applies, ``Item_Fat_Content`` replaced by a 0/1 flag, and the new
        fields ``Outlet_Total_Years``, ``Item_Vis_Log`` and
        ``Item_Identifier_encoded``.

    Raises
    ------
    KeyError
        If ``Item_Fat_Content`` holds a label that is not in
        ``_FAT_CONTENT_FLAGS``.
    """
    item_id = row['Item_Identifier']

    # Missing weight: use the per-item weight from the lookup when the item is known.
    if pd.isna(row['Item_Weight']):
        mode_val = df_weights.loc[df_weights['Item_Identifier'] == item_id, 'Item_Weight']
        if not mode_val.empty:
            row['Item_Weight'] = mode_val.iloc[0]
        else:
            # No weight on record for this item; left as NaN for the caller to handle.
            print(item_id)

    # Missing outlet size: only grocery stores and Tier 2 outlets have an imputation rule.
    if pd.isna(row['Outlet_Size']):
        if row['Outlet_Type'] == 'Grocery Store':
            row['Outlet_Size'] = GROCERY_STORE_SIZE
        elif row['Outlet_Location_Type'] == 'Tier 2':
            row['Outlet_Size'] = LOCATION_TIER_2_SIZE
        else:
            print('NaN is as it is !!', row['Outlet_Identifier'], row['Outlet_Location_Type'], row['Outlet_Type'])

    # Assign 1 for low fat, else 0.
    row['Item_Fat_Content'] = _FAT_CONTENT_FLAGS[row['Item_Fat_Content']]

    # Feature engineering for the establishment year: outlet age relative to 2013.
    row['Outlet_Total_Years'] = int(2013 - row['Outlet_Establishment_Year'])

    # Log-transform the visibility; the +1 keeps a visibility of 0 finite (log(1) == 0).
    row['Item_Vis_Log'] = math.log(row['Item_Visibility'] * 1000 + 1)

    # Target encoding for Item_Identifier. An identifier that is absent from the
    # lookup (possible for rows that are not part of the training data) falls
    # back to the mean of the per-item means instead of raising IndexError.
    item_sales = df_Item_Identifier.loc[df_Item_Identifier['Item_Identifier'] == item_id, 'Item_Outlet_Sales']
    if item_sales.empty:
        row['Item_Identifier_encoded'] = df_Item_Identifier['Item_Outlet_Sales'].mean()
    else:
        row['Item_Identifier_encoded'] = item_sales.iloc[0]
    return row


# Row-wise imputation and feature engineering for both datasets.
df_clean = df.apply(replace_missing_values, axis=1)
unseen_df_clean = unseen_df.apply(replace_missing_values, axis=1)

# Weights that are still missing are filled with the most frequent weight of
# the cleaned training data; the same value is used for both frames.
fallback_weight = df_clean['Item_Weight'].mode().iloc[0]
df_clean['Item_Weight'] = df_clean['Item_Weight'].fillna(fallback_weight)
unseen_df_clean['Item_Weight'] = unseen_df_clean['Item_Weight'].fillna(fallback_weight)

df_clean.isna().sum()


### Target Encoding with KFold for 'Item_Identifier'

In [None]:
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# df_clean['Item_Identifier_encoded'] = np.nan

# for i, (train_index, val_index) in enumerate(kf.split(df_clean)):
#     train_df = df_clean.iloc[train_index].copy()  # Create copies to avoid SettingWithCopyWarning
#     val_df = df_clean.iloc[val_index].copy()    # Create copies to avoid SettingWithCopyWarning

#     means = train_df.groupby('Item_Identifier')['Item_Outlet_Sales'].mean().round(2)

#     # The fix: Use the indices from val_df directly
#     df_clean.loc[val_df.index, 'Item_Identifier_encoded'] = val_df['Item_Identifier'].map(means)


# # Calculate mean for overall df_clean
# means = df_clean.groupby(by='Item_Identifier')['Item_Outlet_Sales'].mean().round(2)
# df_clean['Item_Identifier_encoded'] = df_clean['Item_Identifier_encoded'].fillna(df_clean['Item_Outlet_Sales'].mean().round(2))
# unseen_df_clean['Item_Identifier_encoded'] = unseen_df_clean['Item_Identifier'].map(means)

### One hot encoding for Item_Type, Outlet_Identifier, Outlet_Size, Outlet_Location_Type, Outlet_Type    

In [None]:
def one_hot_encoding(temp_df):
    """Return a copy of ``temp_df`` with its categorical columns one-hot encoded.

    ``Item_Type``, ``Outlet_Identifier``, ``Outlet_Size``,
    ``Outlet_Location_Type`` and ``Outlet_Type`` are replaced by integer
    indicator columns appended after the remaining columns.
    ``Outlet_Establishment_Year`` is dropped as well. The input frame is not
    modified.
    """
    categorical_cols = ['Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
    indicator_cols = pd.get_dummies(temp_df[categorical_cols], dtype=int)
    redundant_cols = categorical_cols + ['Outlet_Establishment_Year']
    return pd.concat([temp_df, indicator_cols], axis=1).drop(columns=redundant_cols)


df_clean = one_hot_encoding(df_clean)
unseen_df_clean = one_hot_encoding(unseen_df_clean)

# The two frames are encoded independently, so a category present in only one
# of them would produce mismatched indicator columns. Align the unseen frame to
# the training layout (minus the target): indicators missing from the unseen
# data are added as all-zero columns and extra ones are discarded.
feature_cols = [col for col in df_clean.columns if col != 'Item_Outlet_Sales']
unseen_df_clean = unseen_df_clean.reindex(columns=feature_cols, fill_value=0)

### Visualization

In [None]:
# `plt.Figure(...)` only instantiates a detached Figure object, so the requested
# size never reached the plot; create the figure through pyplot and draw on its axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df_clean, x='Item_Vis_Log', bins=50, ax=ax)
ax.set_title('Distribution of Item_Vis_Log')
plt.show()

In [None]:
df_clean[df_clean['Item_Vis_Log'] < 1].shape

### Correlation

In [None]:
num_col = [ 'Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Identifier_encoded', 'Outlet_Total_Years','Item_Outlet_Sales']

# `plt.Figure(...)` only instantiates a detached Figure object, so the requested
# size never reached the plot; create the figure through pyplot and draw on its axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df_clean[num_col].corr(), annot=True, ax=ax)
ax.set_title('Correlation of numeric features')
plt.show()

### Train-Test Split

In [None]:
# Target vector and feature matrix used for model fitting.
# NOTE(review): `.iloc[2000:3000]` restricts modelling to 1,000 rows (positions 2000-2999) of df_clean.
# This looks like a subsample kept for fast iteration - confirm before producing a final submission.
y = df_clean['Item_Outlet_Sales'].iloc[2000:3000]
# Dropped from the features: the target, the raw identifier, the raw visibility
# (its log transform Item_Vis_Log is kept) and the target-encoded identifier.
X = df_clean.drop(['Item_Outlet_Sales', 'Item_Identifier', 'Item_Visibility', 'Item_Identifier_encoded'], axis=1).iloc[2000:3000]
X_unseen = unseen_df_clean.drop(['Item_Identifier', 'Item_Visibility', 'Item_Identifier_encoded'], axis=1)

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_unseen.shape

### Scaling

In [None]:
scaler = StandardScaler()
scaling_col = ['Item_Weight', 'Item_Vis_Log', 'Item_MRP', 'Outlet_Total_Years']

# Work on explicit copies so the column assignments below never write into a
# view of another frame (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
X_unseen = X_unseen.copy()

# Fit the scaler on the training split only, then apply the same transform to
# every frame the model will see. X_test was previously left unscaled, so the
# model was evaluated on features in a different scale than it was trained on.
X_train[scaling_col] = scaler.fit_transform(X_train[scaling_col])
X_test[scaling_col] = scaler.transform(X_test[scaling_col])
X_unseen[scaling_col] = scaler.transform(X_unseen[scaling_col])
X_train.head()

### Model Evaluation

In [None]:
def model_evaluation(y_train, y_train_pred, y_test, y_test_pred, model='Regression'):
    """Print RMSE and R2 for the train and test predictions under a banner.

    ``model`` is only used as the label printed in the banner. Nothing is
    returned.
    """
    rule = "*" * 80
    print(rule)
    print("*" * 30, model, '*' * 30)
    print(rule)

    # (heading, ground truth, predictions) for each split, reported in order.
    splits = (
        ('Train Data : ', y_train, y_train_pred),
        ('\n\nTest Data : ', y_test, y_test_pred),
    )
    for heading, actual, predicted in splits:
        print(heading)
        print('RMSE = ', root_mean_squared_error(actual, predicted))
        print('R2_score = ', r2_score(actual, predicted))


### Random Forest

In [None]:
# Hyper-parameter grid explored by the search; random_state is pinned to a
# single value so every candidate forest is reproducible.
param = {
    'n_estimators': [ 50, 70, 100, 150, 200],
    'max_depth': [5, 7, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [5, 10, 15],
    'random_state': [42]
}


rf = RandomForestRegressor()

# Build the RMSE scorer from `root_mean_squared_error` as imported from
# sklearn.metrics at the top of the notebook. The local re-definition that used
# to live here shadowed that import for the rest of the session.
# greater_is_better=False makes the search treat a lower RMSE as better.
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

grid_search = GridSearchCV(estimator=rf,
                           scoring=rmse_scorer,
                           verbose=2,
                           cv=4,
                           param_grid=param,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

In [None]:
# Estimator refit by the grid search with the best-scoring parameters.
best_estimator = grid_search.best_estimator_

# Predict for the train split, the test split and the unseen data, in that order.
y_train_pred, y_test_pred, y_unseen_pred = (
    best_estimator.predict(features) for features in (X_train, X_test, X_unseen)
)
model_evaluation(y_train, y_train_pred, y_test, y_test_pred, 'RF')

In [None]:
best_estimator

### Linear Regression

In [None]:
# lr = LinearRegression()
# lr.fit(X_train, y_train)

In [None]:
# y_train_pred = lr.predict(X_train)
# y_test_pred = lr.predict(X_test)
# y_unseen_pred = lr.predict(X_unseen)
# model_evaluation(y_train, y_train_pred, y_test, y_test_pred)

### Ridge Regularization

In [None]:
# param = {'alpha': np.logspace(-3, 3, 7)}

# ridge = Ridge()


# def root_mean_squared_error(y_true, y_pred):
#     mse = mean_squared_error(y_true, y_pred)
#     return np.sqrt(mse)

# rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
# grid_search = GridSearchCV(estimator=ridge,
#                            param_grid=param,
#                            cv=5,
#                            scoring=rmse_scorer,
#                            n_jobs=-1,
#                            verbose=2)

# grid_search.fit(X_train, y_train)

In [None]:
# best_estimator = grid_search.best_estimator_
# y_train_pred = best_estimator.predict(X_train)
# y_test_pred = best_estimator.predict(X_test)
# y_unseen_pred = best_estimator.predict(X_unseen)
# model_evaluation(y_train, y_train_pred, y_test, y_test_pred, best_estimator)

### Lasso Regularization

In [None]:
# param = {'alpha': np.logspace(-3, 3, 7)}

# lasso = Lasso()

# def root_mean_squared_error(y_true, y_pred):
#     mse = mean_squared_error(y_true, y_pred)
#     return np.sqrt(mse)


# rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
# grid_search = GridSearchCV(estimator=lasso,
#                            param_grid=param,
#                            cv=5,
#                            scoring=rmse_scorer,
#                            n_jobs=-1,
#                            verbose=2)

# grid_search.fit(X_train, y_train)

### Submission File

In [None]:
path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/sample_submission_8RXa3c6.csv'
submission = pd.read_csv(path)
submission.head()

In [None]:
y_unseen_pred

In [None]:
unseen_path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/test_AbJTz2l.csv'
unseen_df = pd.read_csv(unseen_path)
unseen_df.head()


In [None]:
unseen_path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/test_AbJTz2l.csv'
unseen_df = pd.read_csv(unseen_path)

# Build the submission as a new frame with `.assign`. Adding a column to a
# column-subset slice of `unseen_df` triggers SettingWithCopyWarning.
submission_df = unseen_df[['Item_Identifier', 'Outlet_Identifier']].assign(
    Item_Outlet_Sales=y_unseen_pred
)
submission_df.head()

In [None]:
write_file_path = '/home/pc/Desktop_linux/chinu/big_mart_sales_prediction/submission/1st_attempt.csv'

# `to_csv` does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(write_file_path), exist_ok=True)
submission_df.to_csv(write_file_path, index=False)