In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# Fix the global NumPy seed so that Restart & Run All regenerates the same
# synthetic dataset. The purchase simulation below also draws from np.random,
# so seeding here makes the whole analysis reproducible.
SEED = 42
N_SAMPLES = 1000  # number of simulated sessions (rows)
np.random.seed(SEED)

# Generate the dataset: one row per simulated session
data = {
    "Page Views": np.random.poisson(5, N_SAMPLES),  # page-view count, Poisson with mean 5
    "Time Spent on Site": np.random.exponential(10, N_SAMPLES),  # time in minutes, exponential with mean 10
    "Marketing Engagement": np.random.binomial(1, 0.7, N_SAMPLES),  # 1 if engaged with marketing (p = 0.7), else 0
    "Discount Offered": np.random.binomial(1, 0.4, N_SAMPLES),  # 1 if offered a discount (p = 0.4), else 0
}

# Create DataFrame
df_full = pd.DataFrame(data)

# Function to simulate purchase decision
def simulate_purchase(row):
    """Draw a random 0/1 purchase outcome for one session.

    A linear score is built from the four predictors in ``row`` and passed
    through the logistic function to obtain a purchase probability; a single
    uniform draw from ``np.random.rand()`` is then compared against it.

    Parameters
    ----------
    row : mapping
        Must support lookup of 'Page Views', 'Time Spent on Site',
        'Marketing Engagement' and 'Discount Offered'.

    Returns
    -------
    int
        1 if the uniform draw falls below the purchase probability, else 0.
    """
    # Per-feature contributions to the linear score (log-odds)
    views_term = row['Page Views'] * 0.1
    time_term = row['Time Spent on Site'] * 0.05
    engagement_term = row['Marketing Engagement'] * 2
    discount_term = row['Discount Offered'] * 1.5
    linear_score = views_term + time_term + engagement_term + discount_term

    # Logistic (sigmoid) transform turns the score into a probability
    purchase_probability = 1 / (1 + np.exp(-linear_score))

    # Bernoulli draw: exactly one uniform sample is consumed per call
    return int(np.random.rand() < purchase_probability)

# Destination for the exported dataset (relative to the working directory)
excel_file_path = 'Full_Dataset_Logistic_Regression.xlsx'

# Simulate one purchase outcome per row and store it as the target column
df_full['Purchase'] = df_full.apply(func=simulate_purchase, axis=1)

# Write the predictors plus target to Excel, leaving out the row index
df_full.to_excel(excel_writer=excel_file_path, index=False)

# Fit a scikit-learn logistic regression on the full dataset:
# 'Purchase' is the target, every other column is a predictor.
y_full = df_full['Purchase']
X_full = df_full.drop(columns='Purchase')

# NOTE(review): the estimator is built with library defaults; if those include
# regularization, its coefficients need not match an unpenalized fit — confirm.
model_full = LogisticRegression().fit(X_full, y_full)

# Pull the fitted coefficients and intercept off the model
coefficients, intercept = model_full.coef_, model_full.intercept_

# Optional preview of the first five rows of the dataset (disabled; uncomment to use):
# first_five_rows = df_full.head()

# statsmodels does not add an intercept on its own, so prepend a constant
# column to the predictors before fitting.
X_full_with_const = sm.add_constant(X_full)

# Build and fit the Logit model to obtain detailed inference statistics
# (fit() also prints the optimizer's convergence message).
logit_model = sm.Logit(endog=y_full, exog=X_full_with_const)
fitted_model = logit_model.fit()

# Keep the summary table and show it as the cell's output
model_summary = fitted_model.summary()
model_summary


Optimization terminated successfully.
         Current function value: 0.238900
         Iterations 8


0,1,2,3
Dep. Variable:,Purchase,No. Observations:,1000.0
Model:,Logit,Df Residuals:,995.0
Method:,MLE,Df Model:,4.0
Date:,"Fri, 14 Jun 2024",Pseudo R-squ.:,0.2279
Time:,10:01:45,Log-Likelihood:,-238.9
converged:,True,LL-Null:,-309.42
Covariance Type:,nonrobust,LLR p-value:,1.683e-29

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0141,0.339,0.042,0.967,-0.650,0.679
Page Views,0.0579,0.058,1.007,0.314,-0.055,0.171
Time Spent on Site,0.0514,0.015,3.382,0.001,0.022,0.081
Marketing Engagement,2.4211,0.274,8.824,0.000,1.883,2.959
Discount Offered,1.4096,0.295,4.771,0.000,0.830,1.989
