### Import Packages

In [22]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


### Define Functions and Generate the Dummy Dataset

In [24]:

# Catalogue of product labels sampled for the 'Product_Name' column.
PRODUCT_NAMES = [f'Product_{letter}' for letter in 'ABCDE']

# English month names in calendar order, sampled for the 'Month' column.
MONTHS = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]


def random_date(start, end):
    """Return a random datetime between ``start`` and ``end`` (both inclusive).

    The result is ``start`` plus a whole number of seconds drawn uniformly
    with ``random.randint``, so it never has finer than one-second
    resolution relative to ``start``.
    """
    span_seconds = int((end - start).total_seconds())
    offset_seconds = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset_seconds)


def create_dummy_dataset(num_rows, date_start, date_end, seed=None):
    """Build a synthetic sales table.

    Each row gets a random product, a month taken from a random timestamp
    inside the requested date range, a random quantity and unit price, and
    the resulting revenue.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate.
    date_start, date_end : str
        Inclusive bounds of the sampling period, formatted ``'%Y-%m-%d'``.
        Only months that occur inside this period can appear in ``Month``.
    seed : int, optional
        When given, reseeds the global ``random`` and ``numpy.random``
        generators before sampling so the same table is produced on every
        run. Left as ``None`` (the default) the generators are not touched.

    Returns
    -------
    pandas.DataFrame
        Columns ``Product_Name``, ``Month``, ``Quantity``, ``Unit_Price``
        and ``Sales_Revenue`` (``Quantity * Unit_Price``).

    Raises
    ------
    ValueError
        If a date string does not match ``'%Y-%m-%d'`` or ``date_end`` is
        earlier than ``date_start``.
    """
    date_start = datetime.strptime(date_start, '%Y-%m-%d')
    date_end = datetime.strptime(date_end, '%Y-%m-%d')
    if date_end < date_start:
        raise ValueError("date_end must not be earlier than date_start")

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # One random timestamp per row. The Month column is read off these so the
    # date range passed by the caller actually shapes the data.
    dates = [random_date(date_start, date_end) for _ in range(num_rows)]

    data = {
        'Product_Name': [random.choice(PRODUCT_NAMES) for _ in range(num_rows)],  # Category 1: Product names
        'Month': [MONTHS[d.month - 1] for d in dates],                            # Category 2: Month of the sampled date
        'Quantity': np.random.randint(1, 100, size=num_rows),                     # Integers 1..99 (upper bound exclusive)
        'Unit_Price': np.random.uniform(10, 100, size=num_rows)                   # Random unit price for each row
    }

    # Element-wise product of the two NumPy arrays.
    data['Sales_Revenue'] = data['Quantity'] * data['Unit_Price']

    df = pd.DataFrame(data)
    print(df)
    return df

# Generate the 1,000-row synthetic sales table used by the cells below.
df = create_dummy_dataset(num_rows=1000, date_start='2023-01-01', date_end='2024-01-01')



    Product_Name      Month  Quantity  Unit_Price  Sales_Revenue
0      Product_E   November        50   54.208632    2710.431617
1      Product_B        May        42   61.631856    2588.537965
2      Product_E   November        91   69.705096    6343.163749
3      Product_B       June        31   38.221082    1184.853530
4      Product_C    October        40   15.495553     619.822110
..           ...        ...       ...         ...            ...
995    Product_B        May        26   77.326747    2010.495413
996    Product_B  September        83   48.984324    4065.698894
997    Product_E     August        71   25.161205    1786.445537
998    Product_D  September        42   45.860213    1926.128965
999    Product_D  September        84   12.369411    1039.030493

[1000 rows x 5 columns]


### Preprocessing

In [28]:
def preprocess_data(df):
    """Label-encode the categorical columns of the sales table.

    The encoding is applied to a copy, so the frame passed in keeps its
    original string labels and the function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the ``Product_Name`` and ``Month`` columns.

    Returns
    -------
    tuple
        ``(encoded_df, label_encoders)`` where ``encoded_df`` is a copy of
        ``df`` with ``Product_Name`` and ``Month`` replaced by integer codes,
        and ``label_encoders`` maps each of those column names to its fitted
        ``LabelEncoder`` (use ``inverse_transform`` to recover the labels).

    Notes
    -----
    ``LabelEncoder`` numbers classes in sorted order, so ``Month`` codes are
    alphabetical (April=0, August=1, ...), not calendar order.
    """
    # Work on a copy: assigning into `df` directly would overwrite the
    # caller's raw data with integer codes.
    encoded = df.copy()

    label_encoders = {}
    for col in ['Product_Name', 'Month']:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        label_encoders[col] = le  # Store encoders if needed later

    print(encoded)

    return encoded, label_encoders

# Label-encode Product_Name and Month; label_encoders holds the fitted encoder for each column.
df_processed, label_encoders = preprocess_data(df)

     Product_Name  Month  Quantity  Unit_Price  Sales_Revenue
0               4      9        50   54.208632    2710.431617
1               1      8        42   61.631856    2588.537965
2               4      9        91   69.705096    6343.163749
3               1      6        31   38.221082    1184.853530
4               2     10        40   15.495553     619.822110
..            ...    ...       ...         ...            ...
995             1      8        26   77.326747    2010.495413
996             1     11        83   48.984324    4065.698894
997             4      1        71   25.161205    1786.445537
998             3     11        42   45.860213    1926.128965
999             3     11        84   12.369411    1039.030493

[1000 rows x 5 columns]


In [29]:
def apply_lightgbm_regression(df):
    """Fit a LightGBM regressor that predicts ``Sales_Revenue``.

    Every other column of ``df`` is used as a feature. The rows are split
    70/30 into train and test sets (``random_state=42``), a GBDT model is
    trained for 100 boosting rounds with the test set passed as the
    validation set, and the test-set mean squared error is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Sales_Revenue`` column plus the feature columns.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    target = df['Sales_Revenue']
    features = df.drop(columns='Sales_Revenue')

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_test, label=y_test)

    params = dict(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
    )

    booster = lgb.train(
        params,
        train_set,
        num_boost_round=100,
        valid_sets=[valid_set],
    )

    # Score on the held-out rows. Note this prints MSE, while the training
    # metric configured above is RMSE.
    predictions = booster.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print("Mean Squared Error:", mse)

    return booster


# Train the LightGBM regressor on the encoded table; the call prints the test-set MSE.
model = apply_lightgbm_regression(df_processed)


Mean Squared Error: 16175.18127740074
