**Data Loading and Preparation**

In [8]:
import pandas as pd
import numpy as np

# Load the item and sales Data
items_df = pd.read_csv(r"C:\Users\09sra\Downloads\Data Science and Engineering Assignment Datasets\Data Sciene Internship Assignment Datasets\item.csv")
sales_df = pd.read_csv(r"C:\Users\09sra\Downloads\Data Science and Engineering Assignment Datasets\Data Sciene Internship Assignment Datasets\sales.csv")

# Merge sales and items datasets on the 'code' column
merged_df = pd.merge(sales_df, items_df, on='code', how='inner')

# Create a binary feature for voucher usage
merged_df['voucher_used'] = (merged_df['voucher'] > 0).astype(int)

# Create a day-of-week feature from 'day'
merged_df['day_of_week'] = merged_df['day'] % 7


**Feature Engineering and Model Preparation**

In [9]:
# Extracting columns relevant for modeling
model_df = merged_df[['units', 'voucher_used', 'type', 'brand', 'size', 'day_of_week']].copy()

# Drop any rows with NaN
model_df.dropna(subset=['units','voucher_used','type','brand','size','day_of_week'], inplace=True)

# Our target is 'units'
y = model_df['units']

# creating a Feature matrix by dropping 'units' column
X = model_df.drop('units', axis=1)

# One-hot encoding for  categorical features such as type, brand, size, day_of_week
X = pd.get_dummies(X, columns=['type','brand','size','day_of_week'], drop_first=True)


**Model Training and Evaluation**

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split data into train and test(80% training data and 20% testing data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Evaluating to check model performance using RÂ² score
r2_score_train = model.score(X_train, y_train)
r2_score_test  = model.score(X_test, y_test)

print(f"Train R^2: {r2_score_train:.3f}")
print(f"Test R^2:  {r2_score_test:.3f}")


Train R^2: 0.041
Test R^2:  0.032


**Uplift Analysis**

In [11]:
item_types = merged_df['type'].unique()
recommendations = []

# calculates predicted uplift in units sold when voucher is applied
for item_type in item_types:
    # Filter rows for this particular item_type
    subset = merged_df[merged_df['type'] == item_type].copy()
    
    if subset.empty:
        continue  

    # build a dummy-encoded version of the relevant features
    subset_features = subset[['voucher_used', 'type', 'brand', 'size', 'day_of_week']].copy()
    subset_features = pd.get_dummies(
        subset_features, 
        columns=['type','brand','size','day_of_week'],
        drop_first=True
    )
    
    # Make sure we align columns with the model's X
    all_needed_cols = set(X.columns)  # columns used in training
    subset_cols = set(subset_features.columns)
    missing_cols = all_needed_cols - subset_cols
    
    # Add missing columns as 0 in one go to avoid fragmentation
    # This ensures the same dimensionality as X
    if missing_cols:
        # create a zero-filled DataFrame for missing cols
        zeros_df = pd.DataFrame(0, index=subset_features.index, columns=list(missing_cols))
        # concatenate
        subset_features = pd.concat([subset_features, zeros_df], axis=1)
    
    # Reorder to match X exactly
    subset_features = subset_features[X.columns]

    # Compute mean row for that item type
    mean_vector = subset_features.mean()

    # Create two copies, no-voucher and voucher
    mean_vector_no_voucher = mean_vector.copy()
    mean_vector_no_voucher['voucher_used'] = 0
    
    mean_vector_voucher = mean_vector.copy()
    mean_vector_voucher['voucher_used'] = 1

    # Convert these Series to DataFrame rows
    df_no_voucher = pd.DataFrame([mean_vector_no_voucher], columns=X.columns)
    df_voucher    = pd.DataFrame([mean_vector_voucher], columns=X.columns)

    # Predict the no of units sold with and without voucher
    pred_no_voucher = model.predict(df_no_voucher)[0]
    pred_voucher    = model.predict(df_voucher)[0]

    # Uplift = difference
    uplift = pred_voucher - pred_no_voucher
    recommendations.append((item_type, uplift))

# Sort by descending uplift
recommendations.sort(key=lambda x: x[1], reverse=True)

# Output recommended Voucher Allocation
print("Recommended Voucher Allocation (Predicted Uplift in Units):")
for item_type, uplift_val in recommendations:
    print(f"{item_type}: Uplift = {uplift_val:.2f}")


Recommended Voucher Allocation (Predicted Uplift in Units):
Type 2: Uplift = 0.18
Type 3: Uplift = 0.07
Type 1: Uplift = 0.07
Type 4: Uplift = 0.07
