In [7]:
# Import required libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
# Load the dataset
# Reads the training CSV into `df`; a later cell works on a copy of this frame.
df = pd.read_csv("Train.csv")

In [65]:
# Load the final processed test set (`joblib` is imported in the top import cell).
# NOTE(review): this pickle is not created anywhere in the visible notebook —
# presumably it comes from a separate preprocessing step; confirm its provenance.
# joblib.load unpickles the file, which can execute arbitrary code, so only load
# artifacts from a trusted source.
test_final = joblib.load("test_final_processed.pkl")

In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
data = df.copy(deep=True)

In [18]:
# Fail fast if the target column is missing from the working frame
assert {"Item_Outlet_Sales"}.issubset(data.columns)

In [19]:
# List every object-dtype (non-numeric) column and report the names
non_numeric_cols = [name for name in data.select_dtypes(include=["object"]).columns]
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [20]:
# Numeric-only view of the data: every object-dtype column is removed.
# NOTE(review): `data_cleaned` is not referenced by any later cell visible here.
data_cleaned = data.drop(columns=non_numeric_cols)

In [25]:
# Report the number of missing values per column before any imputation
print("Missing values before fixing:", data.isna().sum(), sep="\n")

Missing values before fixing:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [26]:
# Impute missing numeric values with the median of their own column
numeric_cols = data.select_dtypes(include=["int64", "float64"]).columns
data[numeric_cols] = data[numeric_cols].fillna(
    value=data[numeric_cols].median()
)

In [27]:
# Impute missing categorical values with the most frequent value of each column
categorical_cols = data.select_dtypes(include=["object"]).columns
for column in categorical_cols:
    most_frequent = data[column].mode()[0]  # first entry of the mode Series
    data[column] = data[column].fillna(most_frequent)

In [28]:
# Re-count missing values per column to confirm the imputation left none behind
print("\nMissing values after fixing:", data.isna().sum(), sep="\n")


Missing values after fixing:
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [34]:
# Summarise the range of the target. The values are read straight from `data`
# because `y` is only assigned in the later train/validation split cell, so a
# top-to-bottom run (Restart & Run All) would raise NameError on `y` here.
target_sales = data["Item_Outlet_Sales"]
print("Min:", target_sales.min())
print("Max:", target_sales.max())
print("Mean:", target_sales.mean())

Min: 33.29
Max: 13086.9648
Mean: 2181.2889135750365


In [36]:
# Remove the two identifier columns before encoding.
# Re-running this cell raises KeyError, because the columns are already gone from `data`.
data = data.drop(columns=["Item_Identifier", "Outlet_Identifier"])

In [59]:
# XGBoost regressor configured with native categorical support.
# NOTE(review): `xgb` is rebound in the XGBoost benchmarking cell before this
# instance is fitted or used anywhere in the visible notebook.
xgb = XGBRegressor(
    random_state=42,
    enable_categorical=True,
)

In [37]:
# One-hot encode the remaining categorical columns, dropping the first level of each
data_encoded = pd.get_dummies(data=data, drop_first=True)

In [38]:
# Separate target from features, then hold out 20% of the rows for validation
y = data_encoded["Item_Outlet_Sales"]
X = data_encoded.drop(columns=["Item_Outlet_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### RMSE and Cross-Validation

In [39]:
# Define a function to evaluate RMSE
def evaluate_model(model, name, cv=5):
    """Fit ``model`` and print its hold-out RMSE and its cross-validated RMSE.

    Relies on the notebook-level variables ``X_train``, ``y_train``, ``X_val``
    and ``y_val`` (hold-out split) and ``X``, ``y`` (used for cross-validation).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted on
        ``X_train``/``y_train`` as a side effect.
    name : str
        Label used as the prefix of both printed lines.
    cv : int or cross-validation splitter, default=5
        Forwarded unchanged to ``cross_val_score``; the default keeps the
        previous 5-fold behaviour.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Hold-out evaluation
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(f"{name} RMSE: {rmse:.4f}")

    # Cross-validation over X/y. The scorer yields negated RMSE values, so the
    # mean is negated again before printing.
    scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=cv)
    print(f"{name} CV RMSE: {-scores.mean():.4f} (+/- {scores.std():.4f})\n")

### Linear Regression

In [40]:
# Baseline: ordinary least squares with default settings
lr = LinearRegression()
evaluate_model(model=lr, name="Linear Regression")

Linear Regression RMSE: 1069.0979
Linear Regression CV RMSE: 1132.9179 (+/- 14.2492)



### Model Benchmarking

1. Ridge Regression

In [42]:
# Ridge regression benchmark with regularisation strength alpha=1.0.
# `Ridge` is imported in the notebook's top import cell rather than here.
ridge = Ridge(alpha=1.0)
evaluate_model(ridge, "Ridge Regression")

Ridge Regression RMSE: 1069.2254
Ridge Regression CV RMSE: 1132.8972 (+/- 14.2770)



2. Random Forest

In [43]:
# Random forest benchmark: 100 trees, fixed seed
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
evaluate_model(model=rf, name="Random Forest")

Random Forest RMSE: 1098.7635
Random Forest CV RMSE: 1148.5762 (+/- 10.2115)



3. XGBoost

In [44]:
# XGBoost benchmark: 100 boosting rounds, learning rate 0.1, fixed seed
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
evaluate_model(model=xgb, name="XGBoost Regressor")

XGBoost Regressor RMSE: 1065.4167
XGBoost Regressor CV RMSE: 1125.9240 (+/- 6.4054)



### Train XGBoost on Full Train Data

In [45]:
# Refit XGBoost with the benchmark hyper-parameters on all of X/y (no hold-out).
# The fit call stays as the last expression so the cell still displays the estimator.
final_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
final_model.fit(X, y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

### Prepare Test Set and Predict

In [66]:
# Make predictions using the best model
# NOTE(review): assumes `test_final` carries the same feature columns, in the same
# order, as the `X` that `final_model` was fitted on — TODO confirm, since
# `test_final` is loaded from a pickle rather than built in this notebook.
test_preds = final_model.predict(test_final)

In [67]:
# Load original test file to get identifiers
# Only the two identifier columns of this frame are used when the submission is built.
original_test = pd.read_csv("Test.csv")

In [68]:
# Assemble the submission: both identifier columns plus the predicted sales.
# NOTE(review): assumes the rows of `original_test` are in the same order as the
# rows of `test_final` that produced `test_preds` — TODO confirm.
submission = pd.DataFrame(
    {
        "Item_Identifier": original_test["Item_Identifier"],
        "Outlet_Identifier": original_test["Outlet_Identifier"],
        "Item_Outlet_Sales": test_preds,
    }
)

In [69]:
# Write the submission to disk without the DataFrame index, then confirm
submission.to_csv(path_or_buf="submission.csv", index=False)
print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'


In [70]:
# Persist the fitted model to disk with joblib, then confirm
joblib.dump(value=final_model, filename="final_model.pkl")
print("Model saved as 'final_model.pkl'")

Model saved as 'final_model.pkl'
