In [None]:
%cd /content
!git clone https://github.com/Amika1118/DSGP_Group_38.git
%cd DSGP_Group_38

/content
fatal: destination path 'DSGP_Group_38' already exists and is not an empty directory.
/content/DSGP_Group_38


In [None]:
# Switch the working tree to the 'Market-Price-Prediction' branch.
!git checkout Market-Price-Prediction

Branch 'Market-Price-Prediction' set up to track remote branch 'Market-Price-Prediction' from 'origin'.
Switched to a new branch 'Market-Price-Prediction'


In [None]:
# Set the git author name and e-mail at --global scope for this environment.
!git config --global user.name "Lasani Layathma"
!git config --global user.email "lasani.20241357@iit.ac.lk"

In [None]:
# Ask for a GitHub token without echoing it, then rewrite the 'origin' remote
# URL so that it carries the token (presumably so later pushes authenticate).
from getpass import getpass
token = getpass("Enter GitHub token: ")
# NOTE(review): the token is interpolated into a shell command and becomes part
# of the remote URL that git stores for this clone; it also stays in the
# kernel as `token`. Confirm this is acceptable for the environment in use.
!git remote set-url origin https://{token}@github.com/Amika1118/DSGP_Group_38.git

Enter GitHub token: ··········


In [2]:
# Setup and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the weekly dataset, then swap the raw Date column for parsed datetimes
# (the column keeps its position in the frame).
df = pd.read_csv('weekly_preprocessed_with_fuel.csv')
df = df.assign(Date=pd.to_datetime(df['Date']))
print(f"Loaded {df.shape[0]} rows, {df.shape[1]} columns")


Loaded 4758 rows, 32 columns


In [3]:
# Feature Selection - Remove highly correlated features

def select_features(df, threshold=0.95):
    """
    Return the hand-picked base feature names used for modeling.

    The selection is a fixed list: one or two representatives are kept from
    each group of related columns (price lags, rolling means, rolling
    statistics, price changes, external fuel price). No correlation is
    computed here.

    Parameters
    ----------
    df : any
        Accepted for interface compatibility; it is not inspected.
    threshold : float, default 0.95
        Accepted for interface compatibility; it is not used.

    Returns
    -------
    list of str
        A new list on every call, in a fixed order.
    """
    return [
        # Lags: most recent week and the same week one year earlier
        'Price_Lag_1', 'Price_Lag_52',
        # Rolling means: only the shortest window
        'Rolling_Mean_4',
        # Rolling statistics: volatility and range
        'Rolling_Std_4', 'Rolling_Max_4',
        # Price changes: a single momentum column
        'Price_Change_1wk',
        # External factor
        'Fuel_Price',
    ]

# Apply feature selection and report which base columns were kept
selected_features = select_features(df)
print("Selected base features:", selected_features)

Selected base features: ['Price_Lag_1', 'Price_Lag_52', 'Rolling_Mean_4', 'Rolling_Std_4', 'Rolling_Max_4', 'Price_Change_1wk', 'Fuel_Price']


In [4]:
# Cyclical time features
# Month (period 12) and week of year (period 52) are each mapped to a sin/cos
# pair; the result is a new frame, the loaded `df` is left as it was.
df_engineered = df.assign(
    Month_Sin=lambda frame: np.sin(2 * np.pi * frame['Month'] / 12),
    Month_Cos=lambda frame: np.cos(2 * np.pi * frame['Month'] / 12),
    Week_Sin=lambda frame: np.sin(2 * np.pi * frame['Week_of_Year'] / 52),
    Week_Cos=lambda frame: np.cos(2 * np.pi * frame['Week_of_Year'] / 52),
    # Season index from the month: 12,1,2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4
    # (labelled Winter/Spring/Summer/Fall by the original author).
    Season=lambda frame: (frame['Month'] % 12 + 3) // 3,
)

print("Added cyclical time features")

Added cyclical time features


In [5]:
# Create interaction features based on EDA findings

def create_interaction_features(df):
    """
    Return a copy of ``df`` extended with six pairwise product columns.

    Reads 'Fuel_Price', 'Price_Lag_1', 'Price_Lag_52', 'Rolling_Std_4',
    'Price_Change_1wk' and 'Month'; the input frame is not modified.
    """
    fuel = df['Fuel_Price']
    last_week = df['Price_Lag_1']
    volatility = df['Rolling_Std_4']
    month = df['Month']

    products = {
        # Fuel price scaled by recent and year-ago price levels
        'Fuel_x_Lag1': fuel * last_week,
        'Fuel_x_Lag52': fuel * df['Price_Lag_52'],
        # Price level and one-week momentum, each scaled by 4-week volatility
        'Price_Level_x_Volatility': last_week * volatility,
        'Momentum_x_Volatility': df['Price_Change_1wk'] * volatility,
        # Calendar month scaled by price level and by fuel price
        'Month_x_Price': month * last_week,
        'Fuel_x_Month': fuel * month,
    }

    # assign() works on a copy and appends the columns in dict order
    return df.assign(**products)

df_engineered = create_interaction_features(df_engineered)
print("Added interaction features")
# Everything present in the engineered frame that the loaded frame lacks
print("New features:", list(filter(lambda name: name not in df.columns, df_engineered.columns)))

Added interaction features
New features: ['Month_Sin', 'Month_Cos', 'Week_Sin', 'Week_Cos', 'Season', 'Fuel_x_Lag1', 'Fuel_x_Lag52', 'Price_Level_x_Volatility', 'Momentum_x_Volatility', 'Month_x_Price', 'Fuel_x_Month']


In [6]:
# Create optimal lags for fuel price
# EDA showed peak correlation at lags 1-4 weeks

def create_optimal_fuel_lags(df, veg_name=None):
    """
    Add lagged and smoothed fuel-price columns, computed within each vegetable.

    The same lags (1-4 rows) are produced for every vegetable; which of them
    matters is left to the downstream model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Vegetable' and 'Fuel_Price'. Lags follow the existing
        row order inside each vegetable.
        NOTE(review): presumably rows are already sorted by date - confirm.
    veg_name : any, optional
        Not used. Kept (now optional) so existing calls that pass a value
        keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'Fuel_Lag_1' .. 'Fuel_Lag_4' and
        'Fuel_Rolling_Mean_4' added (or overwritten if already present).
    """
    df = df.copy()

    # Build the grouping once and reuse it for every derived column
    fuel_by_veg = df['Fuel_Price'].groupby(df['Vegetable'])

    for lag in (1, 2, 3, 4):
        df[f'Fuel_Lag_{lag}'] = fuel_by_veg.shift(lag)

    # Mean over the current and up to three previous rows; min_periods=1 means
    # the first rows of each vegetable average over fewer values instead of NaN
    df['Fuel_Rolling_Mean_4'] = fuel_by_veg.transform(
        lambda x: x.rolling(window=4, min_periods=1).mean()
    )

    return df

# Get the list of vegetables (also read as a global by later cells)
vegetables = df_engineered['Vegetable'].unique()

# Add the fuel lag columns once; the original cell made this call twice in a
# row, which only repeated the same work.
df_engineered = create_optimal_fuel_lags(df_engineered, vegetables)
print("Added fuel price lag features")

Added fuel price lag features


In [7]:
# Create rate of change features
def create_rate_features(df):
    """
    Add price rate-of-change columns at 4- and 12-week horizons.

    Groups are taken from the frame's own 'Vegetable' column, so the function
    does not depend on any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Vegetable', 'Price', 'Price_Lag_4', 'Price_Lag_12' and
        'Price_Change_1wk'. The acceleration column differences consecutive
        rows inside each vegetable in their existing order.
        NOTE(review): presumably rows are already sorted by date - confirm.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'Price_Change_4wk', 'Price_Change_12wk',
        'Price_Change_Pct_4wk', 'Price_Change_Pct_12wk' and
        'Price_Acceleration' added.

    Notes
    -----
    NOTE(review): the change columns are built from the same-row 'Price'. If
    'Price' is the prediction target, feeding them to a model would leak the
    target - confirm how they are used downstream.
    """
    df = df.copy()

    price = df['Price']
    lag_4 = df['Price_Lag_4']
    lag_12 = df['Price_Lag_12']

    # Absolute change: purely row-wise, so no per-vegetable loop is needed
    df['Price_Change_4wk'] = price - lag_4
    df['Price_Change_12wk'] = price - lag_12

    # Percentage change; a zero lagged price is masked to NaN so the result is
    # NaN rather than +/-inf
    df['Price_Change_Pct_4wk'] = (price / lag_4.where(lag_4 != 0) - 1) * 100
    df['Price_Change_Pct_12wk'] = (price / lag_12.where(lag_12 != 0) - 1) * 100

    # Acceleration: change in the one-week change, never crossing from one
    # vegetable's rows into another's
    df['Price_Acceleration'] = df.groupby('Vegetable')['Price_Change_1wk'].diff()

    return df

# Rebind df_engineered to the copy that carries the rate-of-change columns.
df_engineered = create_rate_features(df_engineered)
print("Added rate of change features")

Added rate of change features


In [8]:
# Create volatility regime features
def create_volatility_features(df):
    """
    Add volatility-regime columns derived from 'Rolling_Std_4', per vegetable.

    Groups are taken from the frame's own 'Vegetable' column, so the function
    does not depend on any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Vegetable' and 'Rolling_Std_4'. The change column
        differences consecutive rows inside each vegetable in their existing
        order.
        NOTE(review): presumably rows are already sorted by date - confirm.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with:
        - 'Volatility_Percentile': percentile rank of Rolling_Std_4 within the
          vegetable (0-1].
        - 'High_Volatility': 1 where Rolling_Std_4 exceeds the vegetable's
          75th percentile, else 0 (integer; rows with NaN volatility get 0).
        - 'Volatility_Change': row-to-row difference of Rolling_Std_4 within
          the vegetable.

    Notes
    -----
    NOTE(review): the rank and the 75% threshold are computed over all rows of
    a vegetable that are passed in, i.e. each row is compared against later
    rows as well - confirm this is intended before a time-based split.
    """
    df = df.copy()

    std_by_veg = df.groupby('Vegetable')['Rolling_Std_4']

    # Percentile rank within each vegetable
    df['Volatility_Percentile'] = std_by_veg.rank(pct=True)

    # Per-vegetable 75th percentile, broadcast back to every row of the group
    volatility_threshold = std_by_veg.transform(lambda s: s.quantile(0.75))
    df['High_Volatility'] = (df['Rolling_Std_4'] > volatility_threshold).astype(int)

    # Change in volatility from the previous row of the same vegetable
    df['Volatility_Change'] = std_by_veg.diff()

    return df

# Rebind df_engineered to the copy that carries the volatility-regime columns.
df_engineered = create_volatility_features(df_engineered)
print("Added volatility regime features")

Added volatility regime features


In [9]:
# Define final feature set for modeling
def get_final_feature_set():
    """
    Build the modeling feature list, grouped by category.

    Returns a tuple ``(final_features, features)``: ``features`` maps each
    category name to its list of column names, and ``final_features`` is those
    lists concatenated in category order.
    """
    features = dict(
        # Lags kept as essential
        essential_lags=['Price_Lag_1', 'Price_Lag_52'],
        # Rolling statistics
        rolling_stats=['Rolling_Mean_4', 'Rolling_Std_4', 'Rolling_Max_4'],
        # Momentum (price changes)
        momentum=['Price_Change_1wk', 'Price_Change_4wk'],
        # Sin/cos encodings of month and week
        cyclical_time=['Month_Sin', 'Month_Cos', 'Week_Sin', 'Week_Cos'],
        # Fuel price and its derived columns
        external=['Fuel_Price', 'Fuel_Lag_2', 'Fuel_Rolling_Mean_4'],
        # Product features
        interactions=['Fuel_x_Lag1', 'Price_Level_x_Volatility'],
        # Volatility regime
        volatility=['Volatility_Percentile', 'High_Volatility'],
        # Categorical identifier
        # NOTE(review): 'Vegetable_Code' is assumed to exist in the data - confirm.
        categorical=['Vegetable_Code'],
    )

    # One flat list, preserving category order and the order inside each category
    final_features = [name for group in features.values() for name in group]

    return final_features, features

final_features, feature_categories = get_final_feature_set()
print("Final feature set by category:")
# One upper-cased heading per category, followed by its columns as bullets
for category in feature_categories:
    feats = feature_categories[category]
    print(f"\n{category.upper()}:")
    for feat in feats:
        print(f"  - {feat}")

Final feature set by category:

ESSENTIAL_LAGS:
  - Price_Lag_1
  - Price_Lag_52

ROLLING_STATS:
  - Rolling_Mean_4
  - Rolling_Std_4
  - Rolling_Max_4

MOMENTUM:
  - Price_Change_1wk
  - Price_Change_4wk

CYCLICAL_TIME:
  - Month_Sin
  - Month_Cos
  - Week_Sin
  - Week_Cos

EXTERNAL:
  - Fuel_Price
  - Fuel_Lag_2
  - Fuel_Rolling_Mean_4

INTERACTIONS:
  - Fuel_x_Lag1
  - Price_Level_x_Volatility

VOLATILITY:
  - Volatility_Percentile
  - High_Volatility

CATEGORICAL:
  - Vegetable_Code


In [10]:
# Save engineered features
# Writes every column of df_engineered to CSV, without the index column.
df_engineered.to_csv('weekly_features_engineered.csv', index=False)
print("Saved engineered features")

Saved engineered features
