# Nexford Final Project: Pricing Optimization using Linear Regression
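This notebook demonstrates demand prediction with a Linear Regression model built on pricing-related features. Categorical variables (region, seasonality, weather condition) are one-hot encoded as dummy variables, with one level per category dropped so the dummies are not perfectly collinear with the intercept.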

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm


In [None]:
# Load the raw retail inventory data; the path is relative to the notebook's working directory.
df = pd.read_csv("retail_store_inventory.csv")
# Preview the first rows to confirm the file parsed as expected.
df.head()


## Data Cleaning and Feature Preparation

In [None]:
# Categorical columns that are one-hot encoded below.
CATEGORICAL_COLUMNS = ['Region', 'Seasonality', 'Weather Condition']
# Numeric predictors that enter the model as-is.
BASE_FEATURES = ['Price', 'Discount', 'Competitor Pricing', 'Holiday/Promotion']

# Build the cleaned frame in one chain and rebind `df` only after every step
# has succeeded. Nothing is mutated in place, so re-running this cell can no
# longer rescale `Discount` a second time: on a re-run `get_dummies` raises a
# KeyError (the categorical columns are already encoded) before `df` changes.
df = (
    df.dropna()  # drop rows with any missing value
      # Discount / 100 converts the stored value to a decimal fraction
      # (assumes the CSV stores it in percentage points -- TODO confirm).
      .assign(Discount=lambda frame: frame['Discount'] / 100.0)
      # drop_first=True removes one level per category so the dummies are not
      # perfectly collinear with the intercept.
      .pipe(pd.get_dummies, columns=CATEGORICAL_COLUMNS, drop_first=True)
)

# Select dummy columns by prefix rather than by substring, so an unrelated
# column that merely contains e.g. "Region_" somewhere in its name is not
# pulled into the model by accident.
DUMMY_PREFIXES = tuple(f'{name}_' for name in CATEGORICAL_COLUMNS)
features = BASE_FEATURES + [col for col in df.columns if col.startswith(DUMMY_PREFIXES)]

X = df[features].copy()
y = df['Units Sold'].astype(float)


## Train the Linear Regression Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the regression on the training split only (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
print("R-squared:", test_r2)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", test_rmse)


## Confidence Intervals Using OLS

In [None]:
# Cast predictors and target to float before handing them to statsmodels.
train_predictors = X_train.astype(float)
train_target = y_train.astype(float)

# statsmodels' OLS does not add an intercept by itself, so prepend a constant column.
X_train_ols = sm.add_constant(train_predictors)
ols_model = sm.OLS(endog=train_target, exog=X_train_ols).fit()
print(ols_model.summary())


## Predict Demand Function

In [None]:
def predict_demand(price, discount, competitor_price, holiday, **kwargs):
    """Predict units sold for a single scenario with the fitted `model`.

    Parameters
    ----------
    price : value for the 'Price' feature.
    discount : value for the 'Discount' feature. The training data divides
        Discount by 100, so pass a fraction here (e.g. 0.1, not 10).
    competitor_price : value for the 'Competitor Pricing' feature.
    holiday : value for the 'Holiday/Promotion' feature
        (presumably a 0/1 flag -- verify against the dataset).
    **kwargs : values for the remaining feature columns (the dummy variables),
        keyed by the exact column name in `X`, e.g. ``Region_North=1``.
        Any such column that is not supplied defaults to 0.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the one-row input.

    Warns
    -----
    UserWarning
        If a keyword is not a feature column of `X`, or duplicates one of the
        four positional inputs. Such keywords are ignored; without the warning
        a typo in a dummy name would silently yield a baseline prediction.
    """
    import warnings  # local import keeps this cell self-contained

    base_inputs = {
        'Price': price,
        'Discount': discount,
        'Competitor Pricing': competitor_price,
        'Holiday/Promotion': holiday,
    }

    # Use the training column order explicitly instead of relying on dict
    # insertion order, so the row always lines up with what the model saw.
    feature_order = list(X.columns)
    known_features = set(feature_order)

    ignored = sorted(
        key for key in kwargs
        if key in base_inputs or key not in known_features
    )
    if ignored:
        warnings.warn(
            f"predict_demand ignored inputs that are not usable model features: {ignored}",
            UserWarning,
            stacklevel=2,
        )

    row = {
        col: base_inputs[col] if col in base_inputs else kwargs.get(col, 0)
        for col in feature_order
    }
    input_df = pd.DataFrame([row], columns=feature_order)
    return model.predict(input_df)[0]

# Example scenario: price 30 with a 0.1 (10%) discount, competitor price 28,
# holiday/promotion flag set, North region, winter, sunny weather.
example_conditions = {
    'Region_North': 1,
    'Seasonality_Winter': 1,
    'Weather Condition_Sunny': 1,
}
predict_demand(price=30, discount=0.1, competitor_price=28, holiday=1, **example_conditions)


## Conclusion
- Linear Regression model trained on cleaned retail data.
- The OLS summary reports the statistical significance (p-values) and confidence intervals of each feature's coefficient.
- `predict_demand()` wraps the trained model so that apps such as Streamlit can pass in user-supplied inputs and get a demand prediction back.