# In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.utils import resample



# Load the campaign dataset from the Excel workbook (path is relative to the working directory).
data = pd.read_excel("Email_Marketing_Campaign_Dataset_Rounded.xlsx")

# Labels of every numeric-dtype column in the loaded frame.
numerical_columns = data.select_dtypes(include="number").columns


# outlier detection
def outlier_detection(df,colname):
    """Compute the 1.5 * IQR fences of one column and collect the rows outside them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``colname``.
    colname : str
        Label of the column to analyse.

    Returns
    -------
    tuple
        ``(lower_extreme, upper_extreme, q1, q3, outliers)`` where ``q1`` and
        ``q3`` are the 25th and 75th percentiles of the column, the extremes
        are ``q1 - 1.5 * IQR`` and ``q3 + 1.5 * IQR``, and ``outliers`` is the
        subset of ``df`` rows whose value is strictly below the lower extreme
        or strictly above the upper extreme.
    """
    values = df[colname]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    low_fence = first_quartile - whisker
    high_fence = third_quartile + whisker

    # Strict comparisons: values sitting exactly on a fence are not outliers,
    # and NaN compares False on both sides so missing values are never selected.
    outside_fences = values.lt(low_fence) | values.gt(high_fence)

    return low_fence, high_fence, first_quartile, third_quartile, df.loc[outside_fences]

#handling outliers by capping them
def handle_outliers(df,column):
    """Cap the values of one column at its 1.5 * IQR fences.

    Values below the lower fence are replaced by the lower fence and values
    above the upper fence are replaced by the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. The column is overwritten in place.
    column : str
        Label of the column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used in an
        assignment.
    """
    # Compute the fences from the frame that was passed in. The previous
    # version read the module-level `data` here, so the function silently
    # used the wrong statistics for any other frame (and failed outright
    # when no global `data` existed).
    lower_extreme, upper_extreme, _, _, _ = outlier_detection(df, column)
    df[column] = df[column].clip(lower=lower_extreme, upper=upper_extreme)
    return df
    
# Cap outliers column by column, but leave two-valued (0/1 flag) columns alone.
# IQR capping is destructive on such columns: when more than 75% of the rows
# share one value, Q1 == Q3, the IQR is 0, both fences collapse onto that value
# and clip() overwrites every minority-class row. That would wipe out flag
# features and the classification target alike.
for column in numerical_columns:
    if data[column].nunique() <= 2:
        continue
    data = handle_outliers(data, column)




# Work on a copy so the feature engineering below leaves `data` untouched.
model_data = data.copy()


## advanced preprocessing

# Age binning: Customer_Age is bucketed into four groups. pd.cut uses
# right-closed intervals by default, so the buckets are
#   (0, 18]   -> 'teen'
#   (18, 35]  -> 'young_adult'
#   (35, 50]  -> 'middle_aged'
#   (50, 100] -> 'senior '
# and any age outside (0, 100] becomes NaN.
bins = [0, 18, 35, 50, 100]
# NOTE(review): 'senior ' carries a trailing space. It is kept unchanged because
# the label becomes one of the encoder's classes and other code may already
# depend on that exact spelling -- confirm before normalising it.
labels = ['teen', 'young_adult', 'middle_aged', 'senior ']
model_data['Age_group'] = pd.cut(model_data['Customer_Age'], bins=bins, labels=labels)


# Customer_Age is now represented by Age_group, so drop the raw column.
model_data = model_data.drop(columns=['Customer_Age'])


# Encode Age_group as integer codes. This step was commented out, which left
# string categories in the feature table (the classifier needs numeric input)
# and left `label_encoder` undefined even though it is pickled further down
# in this script.
# LabelEncoder numbers the classes in sorted label order, not in age order.
label_encoder = LabelEncoder()
model_data['Age_group'] = label_encoder.fit_transform(model_data['Age_group'])



# Split the Data into Training and Testing Sets

# Features (x) are every column except the target; the target (y) is Opened_Previous_Emails.
x = model_data.drop(columns=['Opened_Previous_Emails'])
y = model_data['Opened_Previous_Emails']

# Columns excluded from standardization (flags / category codes);
# every other column of x is standardized.
binary_columns = ['Opened_Previous_Emails', 'Clicked_Previous_Emails', 'Device_Type', 'Age_group']
numerical_columns = [col for col in x.columns if col not in binary_columns]

# Hold out the test rows FIRST. Previously the data was bootstrapped (sampled
# with replacement to twice its size) before splitting, so copies of the same
# row landed in both the training and the test set and the test score measured
# memorisation rather than generalisation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train = x_train.copy()
x_test = x_test.copy()

# Fit the scaler on the training rows only so no test-set statistics leak into
# preprocessing, then apply the same transformation everywhere.
scaler = StandardScaler()
x_train[numerical_columns] = scaler.fit_transform(x_train[numerical_columns])
x_test[numerical_columns] = scaler.transform(x_test[numerical_columns])
# Keep `x` as the standardized feature table, as before.
x[numerical_columns] = scaler.transform(x[numerical_columns])

# Bootstrap only the training rows (sampling with replacement, doubling their
# count); the test rows stay unique and unseen.
x_bootstrapped, y_bootstrapped = resample(
    x_train, y_train, replace=True, n_samples=len(x_train) * 2, random_state=42
)
x_train, y_train = x_bootstrapped, y_bootstrapped

# Random Forest implementation (with hyper-parameter tuning)

# Best parameters obtained from hyperparameter tuning
best_params = dict(
    n_estimators=369,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=False,
)

# Build the forest with a fixed seed for reproducibility and fit it on the training split.
rf_model = RandomForestClassifier(random_state=42, **best_params)
rf_model.fit(x_train, y_train)

# Predicted labels for the test split.
y_pred_rf = rf_model.predict(x_test)



import pickle

# Persist the fitted model. `with` guarantees the file is flushed and closed
# before it is read back below; the previous bare open() calls never closed
# their handles.
filename = 'trained_rf_model.sav'
with open(filename, 'wb') as f:
    pickle.dump(rf_model, f)


# Save scaler and label encoder so inference can reuse the same preprocessing.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
# NOTE(review): this needs `label_encoder` to be bound earlier in the script.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Load the saved model back from disk.
# pickle.load executes code embedded in the file, so only load files this
# script (or another trusted source) produced.
with open(filename, 'rb') as f:
    loaded_model = pickle.load(f)



# Notebook cell output: Prediction: [0]


