In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

# Read the training and test tables from the competition's input directory
df, df_test = map(
    pd.read_csv,
    (
        "/kaggle/input/cse-281-24-predict-the-item-price/train.csv",
        "/kaggle/input/cse-281-24-predict-the-item-price/test.csv",
    ),
)

# Cleaning categorical labels and filling missing data
#
# Every result is assigned back to its column. Calling `replace`/`fillna`
# with `inplace=True` on `df['col']` acts on a temporary Series, which under
# pandas Copy-on-Write leaves the frame itself unchanged.

# Collapse the alternate spellings of the X3 labels onto two canonical values.
x3_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df['X3'] = df['X3'].replace(x3_aliases)
df_test['X3'] = df_test['X3'].replace(x3_aliases)

# Fill missing X2 with the training mean in both frames, so the test set is
# imputed with the same statistic the model sees during training (this
# mirrors the X9 imputation below, which also uses train-derived values).
x2_train_mean = df['X2'].mean()
df['X2'] = df['X2'].fillna(x2_train_mean)
df_test['X2'] = df_test['X2'].fillna(x2_train_mean)

# Fill missing X9 with the most frequent X9 value observed in training for
# the row's X11 group.
mode_X9 = df.groupby('X11')['X9'].apply(lambda x: x.mode()[0])
df['X9'] = df['X9'].fillna(df['X11'].map(mode_X9))
df_test['X9'] = df_test['X9'].fillna(df_test['X11'].map(mode_X9))

# Keep only the two-character prefix of X1 and map the known prefixes to
# readable group names; any other prefix is left as-is.
x1_groups = {'FD': 'Food', 'DR': 'Drink', 'NC': 'Non Consumable'}
df['X1'] = df['X1'].str[:2].replace(x1_groups)
df_test['X1'] = df_test['X1'].str[:2].replace(x1_groups)

# Encoding
# Integer-encode X9, X10 and X11. The encoder is refit on each training
# column and that fitted mapping is applied to the matching test column, so
# both frames share the same codes. After the loop `label_encoder` holds the
# X11 fit.
label_encoder = LabelEncoder()
for ordinal_col in ['X9', 'X10', 'X11']:
    df[ordinal_col] = label_encoder.fit_transform(df[ordinal_col])
    df_test[ordinal_col] = label_encoder.transform(df_test[ordinal_col])

# One-hot encode the remaining categorical columns.
columns_to_encode = ['X1', 'X3', 'X5', 'X7']
df = pd.get_dummies(df, columns=columns_to_encode, prefix_sep='_')
df_test = pd.get_dummies(df_test, columns=columns_to_encode, prefix_sep='_')

# The two frames are encoded independently, so a category that never occurs
# in the test set produces no column there. Add any such column as all zeros
# so every training dummy column can also be selected from df_test.
dummy_prefixes = tuple(f'{name}_' for name in columns_to_encode)
for dummy_col in df.columns:
    if dummy_col.startswith(dummy_prefixes) and dummy_col not in df_test.columns:
        df_test[dummy_col] = 0

# Feature selection
# A column is kept when the absolute value of its correlation with Y reaches
# a threshold. Each correlation is computed once; a NaN correlation compares
# False and is therefore never selected.
columns_to_test = ['X2', 'X4', 'X9', 'X6', 'X8', 'X10', 'X11']
base_corr_threshold = 0.25
interaction_corr_threshold = 0.4

target = df['Y']

# Existing columns: the target itself is skipped explicitly rather than
# being appended and removed afterwards.
features = [
    col for col in df.columns
    if col != 'Y' and abs(df[col].corr(target)) >= base_corr_threshold
]

# Pairwise products of the listed columns. Each unordered pair is built once
# (col2 always comes after col1 in columns_to_test). The product column is
# added to both frames whether or not it ends up selected.
for position, col1 in enumerate(columns_to_test):
    for col2 in columns_to_test[position + 1:]:
        pair_name = f'{col1}_{col2}'
        df[pair_name] = df[col1] * df[col2]
        df_test[pair_name] = df_test[col1] * df_test[col2]
        if abs(df[pair_name].corr(target)) >= interaction_corr_threshold:
            features.append(pair_name)

# Build the target and the train/test feature matrices from the selected columns
y = df['Y']
X = df.loc[:, features]
x_test = df_test.loc[:, features]

# Fit the forest on every training row, then predict the test rows
rf_model = RandomForestRegressor(n_estimators=100, random_state=321).fit(X, y)
y_pred = rf_model.predict(x_test)

# Write the submission: a sequential row_id followed by the predicted Y
submission = pd.DataFrame({'row_id': range(len(y_pred)), 'Y': y_pred})
submission.to_csv('submission.csv', index=False)

# Hold-out evaluation: refit on 80% of the training rows, score on the other 20%
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=321,
)

# NOTE: this rebinds rf_model, replacing the forest fitted on all rows earlier.
rf_model = RandomForestRegressor(n_estimators=100, random_state=321).fit(
    X_train, y_train
)
y_pred_test = rf_model.predict(X_test)

# Report the mean absolute error on the held-out rows
mae = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 0.43724090555555556
