# Nexford Final Project: Pricing Optimization using Linear Regression
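This notebook predicts demand (units sold) with a Linear Regression model built on pricing-related features (price, discount, competitor pricing, holiday/promotion flag) plus dummy variables for region, seasonality, and weather condition. The dummies are created with `drop_first=True`, so one level of each categorical variable serves as the baseline.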

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm


In [3]:
# Read the retail inventory CSV (path is relative to the working directory)
# and preview the first rows.
DATA_PATH = "retail_store_inventory.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.5,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer


## Data Cleaning and Feature Preparation

In [5]:
# Build the modelling frame without mutating `df` in place. Every step below
# returns a new DataFrame and `df` is rebound only after all of them succeed,
# so an error part-way through (or an accidental second run of this cell)
# cannot leave `df` with `Discount` divided by 100 twice.
# NOTE: after a successful run the categorical columns are gone from `df`, so
# running this cell again needs the load cell to be re-run first.
categorical_columns = ['Region', 'Seasonality', 'Weather Condition']

# Drop rows that have a missing value in any column.
rows_complete = df.dropna()

# Convert Discount to a decimal fraction (e.g. 20 -> 0.20).
discount_scaled = rows_complete.assign(Discount=rows_complete['Discount'] / 100.0)

# Create dummy variables; drop_first=True omits the first level of each
# categorical column so that level acts as the baseline.
df = pd.get_dummies(discount_scaled, columns=categorical_columns, drop_first=True)

# Define features: the four numeric pricing inputs followed by every dummy
# column produced above (matched by the prefix get_dummies gives them).
base_features = ['Price', 'Discount', 'Competitor Pricing', 'Holiday/Promotion']
dummy_markers = ('Region_', 'Seasonality_', 'Weather Condition_')
dummy_features = [
    col for col in df.columns
    if any(marker in col for marker in dummy_markers)
]
features = base_features + dummy_features

# .copy() keeps X independent of later changes to df.
X = df[features].copy()
y = df['Units Sold'].astype(float)


## Train the Linear Regression Model

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows.
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R-squared:", test_r2)
print("RMSE:", test_rmse)


R-squared: -0.0010692515323100782
RMSE: 108.86837975395044


## Confidence Intervals Using OLS

In [9]:
# Cast the target and the features to float before handing them to
# statsmodels; add_constant supplies the intercept column (`const` in the
# summary table).
y_train_ols = y_train.astype(float)
X_train_ols = sm.add_constant(X_train.astype(float))

# Fit ordinary least squares on the training split and print the full report.
ols_model = sm.OLS(y_train_ols, X_train_ols).fit()
print(ols_model.summary())


                            OLS Regression Results                            
Dep. Variable:             Units Sold   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.711
Date:                Mon, 19 May 2025   Prob (F-statistic):             0.0518
Time:                        14:33:48   Log-Likelihood:            -3.5729e+05
No. Observations:               58480   AIC:                         7.146e+05
Df Residuals:                   58466   BIC:                         7.147e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

## Predict Demand Function

In [11]:
def predict_demand(price, discount, competitor_price, holiday, *,
                   estimator=None, feature_columns=None, **kwargs):
    """Predict demand for a single scenario.

    Parameters
    ----------
    price, discount, competitor_price, holiday
        Values for the 'Price', 'Discount', 'Competitor Pricing' and
        'Holiday/Promotion' features respectively.
    estimator : optional, keyword-only
        Object with a ``predict(DataFrame)`` method. Defaults to the
        notebook-level ``model``.
    feature_columns : optional, keyword-only
        Ordered column names the estimator expects. Defaults to
        ``X.columns`` from the notebook.
    **kwargs
        Values for the remaining feature columns (the dummy variables),
        keyed by column name, e.g. ``Region_North=1``. Any column not
        supplied is set to 0.

    Returns
    -------
    The first (only) element of ``estimator.predict(...)`` for the one-row
    input frame.

    Warns
    -----
    UserWarning
        If ``kwargs`` contains a name that is not a feature column. Such a
        value is not used, so a misspelled dummy name would otherwise fall
        back to 0 without any signal.
    """
    import warnings

    # Fall back to the notebook globals when no explicit estimator/columns
    # are given, which is how the original function always behaved.
    if estimator is None:
        estimator = model
    if feature_columns is None:
        feature_columns = X.columns
    feature_columns = list(feature_columns)

    # Create a base dictionary from the four positional inputs
    input_data = {
        'Price': price,
        'Discount': discount,
        'Competitor Pricing': competitor_price,
        'Holiday/Promotion': holiday
    }

    # Surface keyword names that match no feature column instead of
    # dropping them silently.
    unknown = sorted(set(kwargs) - set(feature_columns))
    if unknown:
        warnings.warn(
            "predict_demand: ignoring unknown feature(s) " + ", ".join(unknown),
            UserWarning,
            stacklevel=2,
        )

    # Add the remaining columns (dummy variables), defaulting to 0
    for col in feature_columns:
        if col not in input_data:
            input_data[col] = kwargs.get(col, 0)

    # Pass columns= explicitly so the frame's column order always matches
    # feature_columns rather than relying on dict insertion order.
    input_df = pd.DataFrame([input_data], columns=feature_columns)
    return estimator.predict(input_df)[0]

# Example: price 30, discount 0.1, competitor price 28, holiday flag 1,
# with the North / Winter / Sunny dummy columns switched on.
example_dummies = {
    'Region_North': 1,
    'Seasonality_Winter': 1,
    'Weather Condition_Sunny': 1,
}
predict_demand(30, 0.1, 28, 1, **example_dummies)


139.98425220332712

## Conclusion
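- A Linear Regression model was trained on the cleaned retail data. On the held-out test set it scored R² ≈ -0.001 (RMSE ≈ 108.87), so these features explain essentially none of the variation in units sold.
- The OLS summary reports a p-value and a 95% confidence interval for each coefficient. The overall fit (R² = 0.000, Prob(F-statistic) = 0.0518) is not significant at the 5% level.
- `predict_demand()` wraps the fitted model so that an app such as Streamlit can pass user inputs and receive a demand prediction. Given the fit above, treat its output as a baseline rather than a reliable forecast.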