In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier



In [2]:
# Load the dataset
data = pd.read_csv('data_ite_v1.csv')

# Separate features and target variables
X = data.drop(columns=['Hbo_Op1','Hbo_Op2', 'Brand_First_Char', 'ResponseCode'], axis=1)
y_brand = data['Brand_First_Char']
y_response = data['ResponseCode']

# Split the dataset into training and testing set
X_train, X_test, y_brand_train, y_brand_test, y_response_train, y_response_test = train_test_split(
    X, y_brand, y_response, test_size=0.2, random_state=42
)

CLUSTERING


In [3]:
# Feature scaling if necessary / Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform clustering using KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X)

data['Cluster_Labels'] = cluster_labels



DIMENSIONALITY REDUCTION

In [4]:
# Perform dimensionality reduction using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

data['PCA_Component_1'] = X_pca[:, 0]
data['PCA_Component_2'] = X_pca[:, 1]

SUPERVISED LEARNING

In [5]:
# SVM Classifier for 'Brand_First_Char'
svm_classifier_brand = SVC(kernel='linear', C=1, random_state=42)
svm_classifier_brand.fit(X_train_scaled, y_brand_train)
svm_predictions_brand = svm_classifier_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("SVM Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, svm_predictions_brand))
print("Accuracy for 'Brand_First_Char':", accuracy_score(y_brand_test, svm_predictions_brand))



# SVM Classifier for 'ResponseCode'
svm_classifier_response = SVC(kernel='linear', C=1, random_state=42)
svm_classifier_response.fit(X_train_scaled, y_response_train)
svm_predictions_response = svm_classifier_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("SVM Classification Report for 'ResponseCode':", classification_report(y_response_test, svm_predictions_response))
print("Accuracy for 'ResponseCode':", accuracy_score(y_response_test, svm_predictions_response))


SVM Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.47      0.42      0.44       312
           P       0.66      0.93      0.78       282
           R       0.46      0.34      0.39       306

    accuracy                           0.55       900
   macro avg       0.53      0.56      0.54       900
weighted avg       0.53      0.55      0.53       900

Accuracy for 'Brand_First_Char': 0.5522222222222222
SVM Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.57      0.62      0.59       386
           1       0.58      0.70      0.63       391
           2       0.00      0.00      0.00       123

    accuracy                           0.57       900
   macro avg       0.38      0.44      0.41       900
weighted avg       0.49      0.57      0.53       900

Accuracy for 'ResponseCode': 0.5711111111111111


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Random Forest Classifier for 'Brand_First_Char'
rf_classifier_brand = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_brand.fit(X_train_scaled, y_brand_train)
rf_predictions_brand = rf_classifier_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("Random Forest Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, rf_predictions_brand))
print("Accuracy for 'Brand_First_Char':", accuracy_score(y_brand_test, rf_predictions_brand))



# Random Forest Classifier for 'ResponseCode'
rf_classifier_response = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_response.fit(X_train_scaled, y_response_train)
rf_predictions_response = rf_classifier_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("\nRandom Forest Classification Report for 'ResponseCode':", classification_report(y_response_test, rf_predictions_response))
print("Accuracy for 'ResponseCode':", accuracy_score(y_response_test, rf_predictions_response))


Random Forest Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.57      0.57      0.57       312
           P       0.90      0.90      0.90       282
           R       0.54      0.53      0.53       306

    accuracy                           0.66       900
   macro avg       0.67      0.67      0.67       900
weighted avg       0.66      0.66      0.66       900

Accuracy for 'Brand_First_Char': 0.6622222222222223

Random Forest Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.73      0.92      0.81       386
           1       0.83      0.80      0.81       391
           2       0.97      0.28      0.44       123

    accuracy                           0.78       900
   macro avg       0.84      0.67      0.69       900
weighted avg       0.80      0.78      0.76       900

Accuracy for 'ResponseCode': 0.7777777777777778


In [7]:
# KNN Classifier for 'Brand_First_Char'
knn_classifier_brand = KNeighborsClassifier(n_neighbors=3)
knn_classifier_brand.fit(X_train_scaled, y_brand_train)
knn_predictions_brand = knn_classifier_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("KNN Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, knn_predictions_brand))
print("Accuracy for 'Brand_First_Char':", accuracy_score(y_brand_test, knn_predictions_brand))


# KNN Classifier for 'ResponseCode'
knn_classifier_response = KNeighborsClassifier(n_neighbors=3)
knn_classifier_response.fit(X_train_scaled, y_response_train)
knn_predictions_response = knn_classifier_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("\nKNN Classification Report for 'ResponseCode':", classification_report(y_response_test, knn_predictions_response))
print("Accuracy for 'ResponseCode':", accuracy_score(y_response_test, knn_predictions_response))


KNN Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.36      0.44      0.40       312
           P       0.52      0.51      0.52       282
           R       0.39      0.31      0.34       306

    accuracy                           0.42       900
   macro avg       0.42      0.42      0.42       900
weighted avg       0.42      0.42      0.42       900

Accuracy for 'Brand_First_Char': 0.4166666666666667

KNN Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.46      0.56      0.51       386
           1       0.50      0.52      0.51       391
           2       0.04      0.01      0.01       123

    accuracy                           0.47       900
   macro avg       0.33      0.36      0.34       900
weighted avg       0.42      0.47      0.44       900

Accuracy for 'ResponseCode': 0.4666666666666667


In [16]:
param_grid_logistic_regression = {
    'C': [0.1, 0.5, 1],
}

# Logistic Regression Classifier for 'Brand_First_Char'
logistic_regression_brand = LogisticRegression(random_state=42)
grid_search_logistic_regression_brand = GridSearchCV(estimator=logistic_regression_brand, param_grid=param_grid_logistic_regression, cv=5, scoring='accuracy')
grid_search_logistic_regression_brand.fit(X_train_scaled, y_brand_train)

# Get the best Logistic Regression model for 'Brand_First_Char'
best_logistic_regression_brand = grid_search_logistic_regression_brand.best_estimator_
logistic_regression_predictions_brand = best_logistic_regression_brand.predict(X_test_scaled)

# Print the best hyperparameters and classification report for 'Brand_First_Char' using Logistic Regression
print("Best Logistic Regression Hyperparameters for 'Brand_First_Char':", grid_search_logistic_regression_brand.best_params_)
print("Logistic Regression Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, logistic_regression_predictions_brand))
print("Accuracy for 'Brand_First_Char' using Logistic Regression:", accuracy_score(y_brand_test, logistic_regression_predictions_brand))




# Logistic Regression Classifier for 'ResponseCode'
logistic_regression_response = LogisticRegression(random_state=42)
grid_search_logistic_regression_response = GridSearchCV(estimator=logistic_regression_response, param_grid=param_grid_logistic_regression, cv=5, scoring='accuracy')
grid_search_logistic_regression_response.fit(X_train_scaled, y_response_train)

# Get the best Logistic Regression model for 'ResponseCode'
best_logistic_regression_response = grid_search_logistic_regression_response.best_estimator_
logistic_regression_predictions_response = best_logistic_regression_response.predict(X_test_scaled)

# Print the best hyperparameters and classification report for 'ResponseCode' using Logistic Regression
print("\nBest Logistic Regression Hyperparameters for 'ResponseCode':", grid_search_logistic_regression_response.best_params_)
print("Logistic Regression Classification Report for 'ResponseCode':", classification_report(y_response_test, logistic_regression_predictions_response))
print("Accuracy for 'ResponseCode' using Logistic Regression:", accuracy_score(y_response_test, logistic_regression_predictions_response))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Logistic Regression Hyperparameters for 'Brand_First_Char': {'C': 1}
Logistic Regression Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.46      0.37      0.41       312
           P       0.65      0.93      0.77       282
           R       0.46      0.38      0.41       306

    accuracy                           0.55       900
   macro avg       0.53      0.56      0.53       900
weighted avg       0.52      0.55      0.52       900

Accuracy for 'Brand_First_Char' using Logistic Regression: 0.5466666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Best Logistic Regression Hyperparameters for 'ResponseCode': {'C': 1}
Logistic Regression Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.57      0.61      0.59       386
           1       0.58      0.72      0.64       391
           2       0.00      0.00      0.00       123

    accuracy                           0.57       900
   macro avg       0.38      0.44      0.41       900
weighted avg       0.50      0.57      0.53       900

Accuracy for 'ResponseCode' using Logistic Regression: 0.5711111111111111


In [17]:
# Decision Tree Classifier for 'Brand_First_Char'
decision_tree_brand = DecisionTreeClassifier(random_state=42)
decision_tree_brand.fit(X_train_scaled, y_brand_train)
decision_tree_predictions_brand = decision_tree_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("\nDecision Tree Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, decision_tree_predictions_brand))
print("Accuracy for 'Brand_First_Char' using Decision Tree:", accuracy_score(y_brand_test, decision_tree_predictions_brand))



# Decision Tree Classifier for 'ResponseCode'
decision_tree_response = DecisionTreeClassifier(random_state=42)
decision_tree_response.fit(X_train_scaled, y_response_train)
decision_tree_predictions_response = decision_tree_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("\nDecision Tree Classification Report for 'ResponseCode':", classification_report(y_response_test, decision_tree_predictions_response))
print("Accuracy for 'ResponseCode' using Decision Tree:", accuracy_score(y_response_test, decision_tree_predictions_response))


Decision Tree Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.52      0.49      0.51       312
           P       0.83      0.82      0.82       282
           R       0.50      0.53      0.51       306

    accuracy                           0.61       900
   macro avg       0.61      0.61      0.61       900
weighted avg       0.61      0.61      0.61       900

Accuracy for 'Brand_First_Char' using Decision Tree: 0.6077777777777778

Decision Tree Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.73      0.74      0.73       386
           1       0.77      0.72      0.75       391
           2       0.38      0.43      0.40       123

    accuracy                           0.69       900
   macro avg       0.62      0.63      0.63       900
weighted avg       0.70      0.69      0.69       900

Accuracy for 'ResponseCode' using Decision Tree:

In [18]:
# Gaussian Naive Bayes Classifier for 'Brand_First_Char'
naive_bayes_brand = GaussianNB()
naive_bayes_brand.fit(X_train_scaled, y_brand_train)
naive_bayes_predictions_brand = naive_bayes_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("\nGaussian Naive Bayes Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, naive_bayes_predictions_brand))
print("Accuracy for 'Brand_First_Char' using Gaussian Naive Bayes:", accuracy_score(y_brand_test, naive_bayes_predictions_brand))

# Gaussian Naive Bayes Classifier for 'ResponseCode'
naive_bayes_response = GaussianNB()
naive_bayes_response.fit(X_train_scaled, y_response_train)
naive_bayes_predictions_response = naive_bayes_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("\nGaussian Naive Bayes Classification Report for 'ResponseCode':", classification_report(y_response_test, naive_bayes_predictions_response))
print("Accuracy for 'ResponseCode' using Gaussian Naive Bayes:", accuracy_score(y_response_test, naive_bayes_predictions_response))



Gaussian Naive Bayes Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.48      0.50      0.49       312
           P       0.60      0.84      0.70       282
           R       0.46      0.27      0.34       306

    accuracy                           0.53       900
   macro avg       0.51      0.54      0.51       900
weighted avg       0.51      0.53      0.51       900

Accuracy for 'Brand_First_Char' using Gaussian Naive Bayes: 0.53

Gaussian Naive Bayes Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.58      0.48      0.53       386
           1       0.56      0.61      0.59       391
           2       0.16      0.20      0.18       123

    accuracy                           0.50       900
   macro avg       0.43      0.43      0.43       900
weighted avg       0.51      0.50      0.50       900

Accuracy for 'ResponseCode' using Gaussia

In [21]:
# MLP Classifier for 'Brand_First_Char'
mlp_classifier_brand = MLPClassifier(random_state=42)
mlp_classifier_brand.fit(X_train_scaled, y_brand_train)
mlp_predictions_brand = mlp_classifier_brand.predict(X_test_scaled)

# Print classification report and accuracy for 'Brand_First_Char'
print("\nMLP Classification Report for 'Brand_First_Char':", classification_report(y_brand_test, mlp_predictions_brand))
print("Accuracy for 'Brand_First_Char' using MLP:", accuracy_score(y_brand_test, mlp_predictions_brand))

# MLP Classifier for 'ResponseCode'
mlp_classifier_response = MLPClassifier(random_state=42)
mlp_classifier_response.fit(X_train_scaled, y_response_train)
mlp_predictions_response = mlp_classifier_response.predict(X_test_scaled)

# Print classification report and accuracy for 'ResponseCode'
print("\nMLP Classification Report for 'ResponseCode':", classification_report(y_response_test, mlp_predictions_response))
print("Accuracy for 'ResponseCode' using MLP:", accuracy_score(y_response_test, mlp_predictions_response))




MLP Classification Report for 'Brand_First_Char':               precision    recall  f1-score   support

           N       0.50      0.50      0.50       312
           P       0.74      0.82      0.78       282
           R       0.48      0.43      0.46       306

    accuracy                           0.58       900
   macro avg       0.57      0.58      0.58       900
weighted avg       0.57      0.58      0.57       900

Accuracy for 'Brand_First_Char' using MLP: 0.5766666666666667

MLP Classification Report for 'ResponseCode':               precision    recall  f1-score   support

           0       0.56      0.60      0.58       386
           1       0.57      0.62      0.60       391
           2       0.20      0.10      0.13       123

    accuracy                           0.54       900
   macro avg       0.45      0.44      0.44       900
weighted avg       0.52      0.54      0.53       900

Accuracy for 'ResponseCode' using MLP: 0.5433333333333333


