In [2]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(0)

# Generate a sample dataset
num_samples = 1000

# Generate predictor variables
X1 = np.random.normal(loc=0, scale=1, size=num_samples)
X2 = np.random.uniform(low=0, high=1, size=num_samples)
X3 = np.random.normal(loc=0, scale=2, size=num_samples)
X4 = np.random.randint(low=0, high=2, size=num_samples)
X5 = np.random.normal(loc=0, scale=1, size=num_samples)

# Generate noise variable
noise = np.random.normal(loc=0, scale=0.5, size=num_samples)

# Generate target variable
# Let's assume a linear relationship with some noise
target = 2*X1 + 3*X2 + 1.5*X3 + 0.5*X4 + noise

# Create a DataFrame
data = pd.DataFrame({
    'X1': X1,
    'X2': X2,
    'X3': X3,
    'X4': X4,
    'X5': X5,
    'target': target
})

# Display the first few rows of the dataset
print(data)


           X1        X2        X3  X4        X5    target
0    1.764052  0.821904  0.090332   0 -0.361672  5.869385
1    0.400157  0.700529  3.718693   1  2.153720  8.518218
2    0.978738  0.883078 -3.252644   1  0.847408 -0.271896
3    2.240893  0.966575 -0.269645   1 -0.198720  7.664519
4    1.867558  0.774748 -1.168187   0  1.575307  3.954908
..        ...       ...       ...  ..       ...       ...
995  0.412871  0.503528 -1.909886   1 -1.037881 -0.759389
996 -0.198399  0.620842 -2.940803   1  0.346979 -2.346040
997  0.094192  0.832988  2.020855   0  0.252031  5.938620
998 -1.147611  0.564597  0.992359   1  1.750919  0.747540
999 -0.358114  0.090969  1.153912   1 -0.418401  1.266151

[1000 rows x 6 columns]


In [2]:
# Add constant for intercept
import statsmodels.api as sm

X = sm.add_constant(data.drop('target', axis=1))
y = data['target']

# Apply backward elimination
def backward_elimination(X, y, significance_level=0.05):
    num_features = X.shape[1]
    for i in range(num_features):
        regressor_OLS = sm.OLS(y, X).fit()
        max_p_value = max(regressor_OLS.pvalues)
        if max_p_value > significance_level:
            index_to_remove = np.argmax(regressor_OLS.pvalues)
            X = np.delete(X, index_to_remove, 1)
        else:
            break
    return X, regressor_OLS

X_optimal, regressor_OLS = backward_elimination(X.values, y.values)

# Print summary of the final model
print(regressor_OLS.summary())


                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.984
Model:                            OLS   Adj. R-squared (uncentered):              0.984
Method:                 Least Squares   F-statistic:                          1.536e+04
Date:                Mon, 04 Mar 2024   Prob (F-statistic):                        0.00
Time:                        14:44:53   Log-Likelihood:                         -721.28
No. Observations:                1000   AIC:                                      1451.
Df Residuals:                     996   BIC:                                      1470.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [3]:
# Create feature matrix X and target vector y
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

X = data.drop('target', axis=1)
y = data['target']

# Define a function to perform backward elimination
def backward_elimination(X, y, significance_level=0.05):
    num_features = X.shape[1]
    for i in range(num_features):
        model = LinearRegression()
        model.fit(X, y)
        p_values = pd.Series(model.coef_, index=X.columns)
        max_p_value = p_values.max()
        if max_p_value > significance_level:
            index_to_remove = p_values.idxmax()
            X = X.drop(index_to_remove, axis=1)
        else:
            break
    return X

# Apply backward elimination
X_optimal = backward_elimination(X, y)

# Fit final model
final_model = LinearRegression()
final_model.fit(X_optimal, y)

# Print coefficients and intercept
print("Intercept:", final_model.intercept_)
print("Coefficients:", final_model.coef_)



Intercept: 1.5760189069635189
Coefficients: [ 0.00813622 -0.06484184]


## **Business Case**: Optimizing Predictive Model for Sales Forecasting
Background:
Our company, XYZ Retail, operates in the highly competitive retail industry, where accurate sales forecasting is crucial for efficient inventory management, resource allocation, and overall business planning. Inaccurate forecasts can lead to overstocking, stockouts, increased carrying costs, and missed revenue opportunities. To enhance our forecasting capabilities, we aim to develop a robust predictive model that can accurately predict future sales based on historical data and relevant predictor variables.

**Problem Statement:**
Currently, our sales forecasting process relies on traditional methods that often overlook the complex relationships between various factors influencing sales performance. We need to improve the accuracy and reliability of our forecasts by leveraging advanced statistical modeling techniques.

**Objective:**
The primary objective of this initiative is to develop a predictive model using backward elimination, a technique that iteratively selects the most significant predictor variables while eliminating irrelevant ones. By doing so, we aim to optimize our predictive model's performance and enhance the accuracy of our sales forecasts.

