In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# A. Data Load
# Read the advertisement dataset from its CSV file into a DataFrame,
# then show its (rows, columns) shape as the cell output.
data = pd.read_csv(filepath_or_buffer='advertsuccess.csv')
data.shape


In [None]:
# B. Exploratory Data Analysis

# Column names, dtypes and non-null counts (info() prints its report directly)
data.info()

# Descriptive statistics, printed as plain text
summary_stats = data.describe()
print(summary_stats)


In [None]:
# Correlation matrix
# Correlation is only defined for numeric data. The string-valued columns are
# not encoded until the data-preparation cells, and pandas >= 2.0 raises on
# them instead of silently skipping them, so restrict the frame explicitly.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".4f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Pairwise relationships between the variables, drawn as a grid of plots
pair_grid = sns.pairplot(data=data)
plt.show()


In [None]:
# Other graphs (scatter plots, histograms, etc.)

# Distribution of airtime
sns.histplot(data['airtime'], bins=20, kde=True)
plt.title("Histogram of Airtime")
plt.show()

# Net gain against ratings
sns.scatterplot(x='ratings', y='netgain', data=data)
plt.title("Scatter Plot of Ratings vs. Net Gain")
plt.show()

# Net gain against genre
sns.scatterplot(x='genre', y='netgain', data=data)
plt.title("Scatter Plot of Genre vs. Net Gain")
plt.show()


# Relationships
# Joint distribution of net gain with each of these features. The sex column
# is referenced as 'targeted_sex', the same name the encoding step uses later
# in the notebook; a different spelling here would raise a KeyError.
for feature in ('industry', 'targeted_sex', 'relationship_status'):
    sns.jointplot(x=data[feature], y=data['netgain'])
plt.show()


In [None]:
# Check for outliers: draw one box plot per column of interest
for column in ('netgain', 'ratings', 'airtime'):
    sns.boxplot(x=data[column])
    plt.show()


In [None]:
# C. Data Preparation
# Count the missing values in each column
missing_per_column = data.isnull().sum()
print(missing_per_column)

In [None]:
# Drop every row that contains at least one missing value,
# then show the remaining (rows, columns) shape
data = data.dropna(axis=0, how='any')
data.shape

In [None]:
# Categorical values encoding
# Explicit integer code for every expected category, per column.
category_codes = {
    'targeted_sex': {'male': 0, 'female': 1},
    'genre': {'horror': 1, 'comedy': 2},
    'industry': {'pharma': 1, 'cinema': 2, 'food': 3, 'car': 4},
}

for column, codes in category_codes.items():
    encoded = data[column].map(codes)
    # A value with no entry in the mapping is encoded as NaN. Missing rows
    # were already dropped, so report these instead of letting new NaNs reach
    # the models unnoticed (this also fires if the cell is run twice).
    unmapped = data.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        print(f"Warning: {column!r} has values without a code (encoded as NaN): {list(unmapped)}")
    data[column] = encoded

In [None]:
# Feature Engineering
# Interaction term: element-wise product of the encoded genre and airlocation.
# NOTE(review): airlocation is not encoded in the preceding cells - confirm it is numeric here.
data['genre_airlocation_interaction'] = data['genre'].mul(data['airlocation'])

In [None]:
# Split dataset into training and test sets; isolate the test set
# 'netgain' is the target; every other column is used as a feature.
y = data['netgain']
X = data.drop(columns=['netgain'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Standard Scaling
# The models are trained and evaluated on X_train / X_test, so those are the
# frames that must be scaled; scaling `data` after the split has no effect on
# them. The scaler is fitted on the training rows only, so that no test-set
# statistics leak into training, and the same transform is applied to the test rows.
scale_columns = ['ratings', 'average_runtime']
scaler = StandardScaler()

# Work on explicit copies so the column assignment cannot alias the unsplit frame
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_columns] = scaler.fit_transform(X_train[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

In [None]:
# D. Model training
# Fit four classifiers on the same training split, each with a fixed seed.

# Random Forest
rf_model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Logistic Regression
lr_model = LogisticRegression(random_state=0).fit(X_train, y_train)

# XGBoost
xgb_model = XGBClassifier(random_state=0).fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(random_state=0).fit(X_train, y_train)

In [None]:
# E. Model evaluation
# Predict on the test set and compute the evaluation metrics
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters:
        model: fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: feature rows to predict on.
        y_test: true labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, report, conf_matrix, roc_auc, mcc)`` where ``report``
        is the text classification report and ``conf_matrix`` the confusion matrix.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # ROC AUC ranks samples by a continuous score; hard 0/1 predictions collapse
    # the curve to a single operating point. Prefer the probability of the
    # second class, then the decision function, and fall back to the predicted
    # labels only when the model exposes neither (or the problem is not binary).
    y_score = y_pred
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)
        if proba.ndim == 2 and proba.shape[1] == 2:
            y_score = proba[:, 1]
    elif hasattr(model, "decision_function"):
        decision = model.decision_function(X_test)
        if decision.ndim == 1:
            y_score = decision
    roc_auc = roc_auc_score(y_test, y_score)

    mcc = matthews_corrcoef(y_test, y_pred)
    return accuracy, report, conf_matrix, roc_auc, mcc

def _print_evaluation(label, metrics):
    """Print the five metrics returned by evaluate_model, prefixed with the model label."""
    accuracy, report, conf_matrix, roc_auc, mcc = metrics
    print(f"{label} Accuracy:", accuracy)
    print(f"{label} Classification Report:")
    print(report)
    print(f"{label} Confusion Matrix:")
    print(conf_matrix)
    print(f"{label} ROC AUC:", roc_auc)
    print(f"{label} Matthews Correlation Coefficient:", mcc)


# Evaluate Random Forest model
rf_metrics = evaluate_model(rf_model, X_test, y_test)
rf_accuracy, rf_report, rf_conf_matrix, rf_roc_auc, rf_mcc = rf_metrics
_print_evaluation("Random Forest", rf_metrics)

# Evaluate Logistic Regression model
lr_metrics = evaluate_model(lr_model, X_test, y_test)
lr_accuracy, lr_report, lr_conf_matrix, lr_roc_auc, lr_mcc = lr_metrics
print()  # blank line between model sections
_print_evaluation("Logistic Regression", lr_metrics)

# Evaluate XGBoost model
xgb_metrics = evaluate_model(xgb_model, X_test, y_test)
xgb_accuracy, xgb_report, xgb_conf_matrix, xgb_roc_auc, xgb_mcc = xgb_metrics
print()
_print_evaluation("XGBoost", xgb_metrics)

# Evaluate SVM model
svm_metrics = evaluate_model(svm_model, X_test, y_test)
svm_accuracy, svm_report, svm_conf_matrix, svm_roc_auc, svm_mcc = svm_metrics
print()
_print_evaluation("SVM", svm_metrics)

