In [None]:
"""Data Cleaning and Preprocessing"""
import pandas as pd
import numpy as np

# Read data from Excel file
data = pd.read_excel("daily_offers.xlsx")

# Check for missing values
print(data.isnull().sum())

# Replace missing values with the column median.
# A median only exists for numeric columns, and pandas >= 2.0 raises a TypeError
# when DataFrame.median() meets a non-numeric column, so restrict the fill to the
# numeric ones. Non-numeric columns keep their missing values untouched.
numeric_cols = data.select_dtypes(include="number").columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Remove duplicates (rebinding instead of inplace=True keeps the step explicit)
data = data.drop_duplicates()

# Normalize data (z-score) -- numeric feature columns only.
# "target" is left on its original scale: later cells split rows on
# target == 0 / target == 1 and fit a classifier on it, which a standardized
# target would break. Non-numeric columns cannot be standardized at all.
feature_cols = numeric_cols.drop("target", errors="ignore")
feature_mean = data[feature_cols].mean()
# A constant column has std == 0; divide by 1 instead so it becomes all zeros
# rather than NaN/inf.
feature_std = data[feature_cols].std().replace(0, 1)
data[feature_cols] = (data[feature_cols] - feature_mean) / feature_std


In [None]:

"""Exploratory Data Analysis"""
import seaborn as sns
import matplotlib.pyplot as plt

# Scatterplot of two variables
ax = sns.scatterplot(x="age", y="income", data=data)
ax.set_title("Income vs. age")
plt.show()

# Histogram of a variable
ax = sns.histplot(x="income", data=data)
ax.set_title("Distribution of income")
plt.show()

# Boxplot of a variable
ax = sns.boxplot(x="education", y="income", data=data)
ax.set_title("Income by education")
plt.show()

# Correlation matrix of all numeric variables.
# numeric_only=True is required: "education" is used as a categorical column
# above, and pandas >= 2.0 raises instead of silently skipping non-numeric
# columns in DataFrame.corr().
ax = sns.heatmap(data.corr(numeric_only=True), annot=True)
ax.set_title("Correlation matrix")
plt.show()



In [None]:
"""Feature Engineering"""

# One-hot encode the categorical "education" column
data = pd.get_dummies(data, columns=["education"])

# Interaction feature: element-wise product of age and income.
# It is derived before the scaler below runs, so it is built from the
# not-yet-rescaled age and income values.
data = data.assign(age_income_interaction=data["age"] * data["income"])

# Standardize age and income (fit the scaler first, then apply it)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data[["age", "income"]])
data[["age", "income"]] = scaler.transform(data[["age", "income"]])


In [None]:
"""Feature Selection"""
# Perform ANOVA test to select features
from scipy.stats import f_oneway

# The split by target class does not depend on the feature being tested, so
# build the two groups once instead of re-filtering the frame twice per column.
target_0_rows = data[data["target"] == 0]
target_1_rows = data[data["target"] == 1]

for feature in data.columns:
    if feature == "target":
        continue
    # Only the p-value is used; the F statistic is discarded.
    _, p = f_oneway(target_0_rows[feature], target_1_rows[feature])
    # 0.05 is the significance threshold for reporting a feature.
    if p < 0.05:
        print(f"Significant feature: {feature}")

# Perform LASSO regression to select features
from sklearn.linear_model import LassoCV

X = data.drop("target", axis=1)
y = data["target"]
lasso = LassoCV(cv=5).fit(X, y)
coef = pd.Series(lasso.coef_, index=X.columns)
# A feature counts as "selected" when LASSO left it a non-zero coefficient.
print(f"Lasso selected {(coef != 0).sum()} features")


In [None]:
"""Regression Model"""
# Fit three estimators on the same design matrix. Each one gets its own name so
# that a later fit does not discard an earlier one.
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

# Features and target are identical for every estimator, so build them once.
X = data.drop("target", axis=1)
y = data["target"]

# Fit linear regression model
linreg_model = LinearRegression().fit(X, y)

# Fit logistic regression model
logreg_model = LogisticRegression().fit(X, y)

# Fit random forest regression model
# random_state pins the forest's bootstrap/feature sampling so that a
# Restart & Run All reproduces the same model.
rf_model = RandomForestRegressor(random_state=42).fit(X, y)

# Backward compatibility: `model` previously ended up bound to the estimator
# fitted last (the random forest); keep that binding for cells that read it.
model = rf_model


In [None]:
"""Model Evaluation"""
# Evaluate linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# `model` is rebound by every fit in the training cell, so by the time this cell
# runs it refers to the estimator fitted last, not the linear regression.
# Fit a dedicated linear regression so the numbers below describe the model the
# heading names.
linreg_eval_model = LinearRegression().fit(X, y)

# NOTE: predictions are made on the same rows the model was fitted on, so these
# are in-sample scores.
y_pred = linreg_eval_model.predict(X)
print(f"R-squared: {r2_score(y, y_pred)}")
print(f"Mean squared error: {mean_squared_error(y, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y, y_pred)}")


In [None]:
# Evaluate logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# `model` is rebound by every fit in the training cell and ends up as the
# random forest *regressor*. Its continuous predictions cannot be scored with
# classification metrics, so fit a dedicated logistic regression here.
logreg_eval_model = LogisticRegression().fit(X, y)

# NOTE: predictions are made on the same rows the model was fitted on, so these
# are in-sample scores.
y_pred = logreg_eval_model.predict(X)
print(f"Accuracy: {accuracy_score(y, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y, y_pred)}")
print(f"Classification report: {classification_report(y, y_pred)}")


In [None]:
"""Evaluate random forest regression model"""

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# `model` is the estimator bound last in the training cell, i.e. the random
# forest regressor. Predictions are made on the same X it was fitted on.
y_pred = model.predict(X)

# Compute the three regression scores first, then report them.
rf_r2 = r2_score(y, y_pred)
rf_mse = mean_squared_error(y, y_pred)
rf_mae = mean_absolute_error(y, y_pred)

print(f"R-squared: {rf_r2}")
print(f"Mean squared error: {rf_mse}")
print(f"Mean absolute error: {rf_mae}")


In [None]:
"""Visualize Residuals"""
# Residual = actual - predicted, using the most recently computed y_pred.
residuals = y - y_pred

ax = sns.scatterplot(x=y_pred, y=residuals)
# Reference line: a well-behaved model scatters evenly around zero.
ax.axhline(0, color="gray", linestyle="--", linewidth=1)
# Label the axes explicitly; without this the y-axis would pick up the name of
# the `y` series ("target") even though it shows residuals.
ax.set(
    xlabel="Predicted value",
    ylabel="Residual (actual - predicted)",
    title="Residuals vs. predicted values",
)
plt.show()
