# Generate synthetic data

In [None]:
import numpy as np
import pandas as pd

# Pin NumPy's global RNG so every run reproduces the same synthetic sample
np.random.seed(0)

# Column labels, kept in one place so the frame and the formula below agree
spend_col = 'Marketing Spend (£1000s)'
footfall_col = 'Store Footfall (1000 people)'
engagement_col = 'Online Engagement (1000 minutes)'
sales_col = 'Monthly Sales (£1000s)'

# Draw the three predictors from normal distributions and truncate to integers.
# The draw order (spend, footfall, engagement) determines the random stream,
# so it must not be rearranged.
n_rows = 100
spend_draws = np.random.normal(50, 10, n_rows).astype(int)
footfall_draws = np.random.normal(2000, 300, n_rows).astype(int)
engagement_draws = np.random.normal(100, 20, n_rows).astype(int)

# The sales column starts as an all-zero placeholder and is overwritten below
data = {
    spend_col: spend_draws,
    footfall_col: footfall_draws,
    engagement_col: engagement_draws,
    sales_col: np.zeros(n_rows, dtype=int),
}

df = pd.DataFrame(data)

# Sales follow a linear model of the predictors plus Gaussian noise (sd 10);
# the result is truncated to integers like the other columns
noise = np.random.normal(0, 10, n_rows)
linear_signal = (
    11 + 1.7*df[spend_col] +
    0.5*df[footfall_col] +
    0.2*df[engagement_col]
)
df[sales_col] = (linear_signal + noise).astype(int)

# Persist the generated table as an Excel workbook, without the row index
df.to_excel('synthetic_sales_data.xlsx', index=False)


# Data analysis

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Define the features and target with an intercept for statsmodels
X = df[['Marketing Spend (£1000s)', 'Store Footfall (1000 people)', 'Online Engagement (1000 minutes)']]
X = sm.add_constant(X)  # adding a constant for the intercept
y = df['Monthly Sales (£1000s)']

# Fit the model
model = sm.OLS(y, X).fit()

# Display the regression results
print(model.summary())

# Extract and display the regression equation.
# model.params is a Series labelled by column name, so its entries are read by
# position with .iloc: integer keys passed to [] on a label-indexed Series are
# deprecated in pandas 2.x and no longer fall back to positions in pandas 3.
# Position 0 is treated as the intercept; X1..Xn follow the column order of X.
coefficients = model.params
slope_terms = [
    f"({coef:.2f} * X{i})"
    for i, coef in enumerate(coefficients.iloc[1:], start=1)
]
equation = " + ".join([f"Y = {coefficients.iloc[0]:.2f}", *slope_terms])

# Calculate and display Variance Inflation Factor (VIF) for each variable.
# X still contains the constant column added above, so that column gets a VIF
# row as well.
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

print("\nMultiple Regression Equation:")
print(equation)




                              OLS Regression Results                              
Dep. Variable:     Monthly Sales (£1000s)   R-squared:                       0.997
Model:                                OLS   Adj. R-squared:                  0.997
Method:                     Least Squares   F-statistic:                     9688.
Date:                    Fri, 07 Jun 2024   Prob (F-statistic):          5.43e-119
Time:                            17:13:45   Log-Likelihood:                -363.16
No. Observations:                     100   AIC:                             734.3
Df Residuals:                          96   BIC:                             744.7
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------