In [5]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [10]:
# Reload test and train datasets
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [24]:
# Combine both datasets for consistent preprocessing
test_copy = test.copy()
combined = pd.concat([train.drop('Item_Outlet_Sales', axis=1), test_copy], axis=0)

In [22]:
# Convert object columns to category
for col in test_processed.select_dtypes(include='object').columns:
    test_processed[col] = test_processed[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_processed[col] = test_processed[col].astype('category')


In [12]:
# Handle missing values
combined['Item_Weight'].fillna(combined['Item_Weight'].mean(), inplace=True)
combined['Outlet_Size'].fillna(combined['Outlet_Size'].mode()[0], inplace=True)

In [13]:
# Standardize 'Item_Fat_Content'
combined['Item_Fat_Content'] = combined['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})

In [14]:
# Derive 'Item_Category' from 'Item_Identifier'
combined['Item_Category'] = combined['Item_Identifier'].apply(lambda x: x[:2])
combined['Item_Category'] = combined['Item_Category'].map({'FD': 'Food','NC': 'Non-Consumable','DR': 'Drinks'})

In [15]:
# Set 'Item_Fat_Content' to 'Non-Edible' for Non-Consumables
combined.loc[combined['Item_Category'] == "Non-Consumable", 'Item_Fat_Content'] = "Non-Edible"

In [16]:
# Encode categorical variables
categorical_cols = ['Item_Fat_Content', 'Outlet_Identifier', 'Outlet_Size','Outlet_Location_Type', 'Outlet_Type', 'Item_Category']

le = LabelEncoder()
for col in categorical_cols:
    combined[col] = le.fit_transform(combined[col])

In [27]:
# Columns to encode
cat_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 
            'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Apply LabelEncoder (assuming same encoder as used in training)
for col in cat_cols:
    le = LabelEncoder()
    # Fit on train + test combined or just reuse encoder from training
    test[col] = le.fit_transform(test[col])
    
# Also make sure no NaNs remain
test = test.fillna(0)

In [30]:
# Make a copy
test_processed = test.copy()

# Fill missing values (if not already done)
test_processed['Item_Weight'].fillna(test_processed['Item_Weight'].mean(), inplace=True)
test_processed['Outlet_Size'].fillna(test_processed['Outlet_Size'].mode()[0], inplace=True)

# Label encode object (categorical) columns
cat_cols = test_processed.select_dtypes(include='object').columns

le = LabelEncoder()
for col in cat_cols:
    test_processed[col] = le.fit_transform(test_processed[col].astype(str))

# Final check
print(test_processed.dtypes)

Item_Identifier                int32
Item_Weight                  float64
Item_Fat_Content               int32
Item_Visibility              float64
Item_Type                      int32
Item_MRP                     float64
Outlet_Identifier              int32
Outlet_Establishment_Year      int64
Outlet_Size                    int32
Outlet_Location_Type           int32
Outlet_Type                    int32
dtype: object


In [17]:
# Create 'Outlet_Years' feature
combined['Outlet_Years'] = 2013 - combined['Outlet_Establishment_Year']

In [18]:
# Drop unused columns
combined.drop(['Item_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

In [19]:
# Extract the processed test set
test_processed = combined[len(train):]
test_processed.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Years
0,20.75,0,0.007565,Snack Foods,107.8622,9,1,0,1,1,14
1,8.3,2,0.038428,Dairy,87.3198,2,1,1,1,1,6
2,14.6,1,0.099575,Others,241.7538,0,1,2,0,2,15
3,7.315,0,0.015388,Snack Foods,155.034,2,1,1,1,1,6
4,12.792854,2,0.118599,Dairy,234.23,5,1,2,3,1,28


In [29]:
# At the end of Test_preprocessed.ipynb
test_processed.to_csv('Test_preprocessed.csv', index=False)
test_processed = test.copy()

In [33]:
import joblib

In [32]:
# Load the test_preprocessed
test_processed = joblib.load("test_processed.pkl")

# Drop columns not used in training
test_processed.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1, inplace=True)

# One-hot encode the same categorical columns (use training columns as reference)
# You must use same columns as in training
model_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year',
 'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular', 'Item_Fat_Content_low fat', 'Item_Fat_Content_reg',
 'Item_Type_Breads', 'Item_Type_Breakfast', 'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
 'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks', 'Item_Type_Health and Hygiene',
 'Item_Type_Household', 'Item_Type_Meat', 'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
 'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods', 'Outlet_Size_Medium', 'Outlet_Size_Small',
 'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3', 'Outlet_Type_Supermarket Type1',
 'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3']

# One-hot encode test set
test_encoded = pd.get_dummies(test_processed)

# Add missing columns
for col in model_features:
    if col not in test_encoded.columns:
        test_encoded[col] = 0

# Ensure same column order
test_encoded = test_encoded[model_features]

# Save the updated test set
joblib.dump(test_encoded, 'test_final_processed.pkl')

['test_final_processed.pkl']