# DTSC Project 2: 
* Name: Jason Ortiz
* NYIT ID: 1258640
* Due: 12/12/2022 @ 11:59PM
* Professor: Kiran Balgani

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb

## Data Prepping

In [None]:
# Load the raw spam dataset; a comma is already the default delimiter for read_csv.
original_spam_df = pd.read_csv('spam.csv')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
original_spam_df.head()

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
original_spam_df.shape

In [None]:
# Summary statistics per column, used as a quick sanity check of value ranges.
original_spam_df.describe()

In [None]:
# Number of missing values per column (all zeros means the data is complete).
# `.notnull().count()` cannot be used for this: count() on the boolean mask
# counts every cell, so it always equals the row count.
original_spam_df.isnull().sum()

In [None]:
# List the distinct labels present in the target column.
original_spam_df["Class"].unique()

The `Class` column takes only two distinct values, which indicates that this is a binary classification problem.

In [None]:
# Every column except the target. Dropping 'Class' by name (rather than by
# position) keeps this correct even if the label is not the last column.
non_class_features = original_spam_df.columns.drop('Class')
non_class_features

In [None]:
# Tasks 1 and 2 call for 1000 training rows with the remainder used for testing.
# Requesting the row count directly replaces the hand-computed test fraction
# (3601/4601) and yields the same partition for the same seed.
# NOTE(review): the task text asks for the *first* 1000 instances, whereas this
# draws a random 1000 (train_test_split shuffles by default). Pass shuffle=False
# if row order must be kept, after confirming the file is not sorted by class.
spam_training_set, spam_test_set = train_test_split(
    original_spam_df,
    train_size=1000,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
spam_training_data = spam_training_set[non_class_features]
spam_training_target = spam_training_set['Class']
spam_test_data = spam_test_set[non_class_features]
spam_test_target = spam_test_set['Class']

In [None]:
# Report the shape of each partition; the `=` specifier echoes the expression next to its value.
print(f'{spam_training_data.shape = }\n{spam_test_data.shape = }')

In [None]:
# Peek at the first training rows to confirm the feature columns look as expected.
spam_training_data.head()

In [None]:
def rate_classifier(test_target_data, test_target_predict, title=None):
    """Plot an annotated confusion matrix and print metrics for a binary classifier.

    Parameters
    ----------
    test_target_data : array-like
        True class labels of the evaluated samples.
    test_target_predict : array-like
        Predicted labels for the same samples, in the same order.
    title : str, optional
        Heading drawn above the heatmap; no title is set when None.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, because the cell names used for
        the annotations only make sense for two classes.
    """
    cm = confusion_matrix(test_target_data, test_target_predict)
    if cm.shape != (2, 2):
        raise ValueError(
            f"rate_classifier expects exactly two classes, got a confusion matrix of shape {cm.shape}"
        )

    # Each cell as a share of all evaluated samples.
    fractions = cm / cm.sum()

    # Row-major order of the matrix: rows are the truth, columns the prediction.
    terminology = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
    labels = np.asarray([
        f"{name}\n{count:0.0f}\n{share:0.2%}"
        for name, count, share in zip(terminology, cm.flatten(), fractions.flatten())
    ]).reshape(2, 2)

    # Explicit figure/axes so the labels attach to this heatmap only.
    fig, ax = plt.subplots(figsize=(10, 7))
    sb.heatmap(fractions, annot=labels, fmt='', cmap='Blues', ax=ax)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Truth')
    if title is not None:
        ax.set_title(title)

    print("Classification Report\n", classification_report(test_target_data, test_target_predict))

    print("Accuracy Score\n", accuracy_score(test_target_data, test_target_predict))

## Fused Majority Voting Rule Classifier

In [None]:
# Unfitted base learners for the voting ensemble. VotingClassifier.fit clones and
# trains every estimator it is given, and AdaBoost likewise clones its base
# learner, so fitting them here as well would only repeat the work.
decision_tree_classifier = DecisionTreeClassifier(random_state=99)  # seeded so the tree is repeatable
gauss_nb_MJVT = GaussianNB()
# max_iter of 1191 required for Logistic Regression to converge
logistic_regression_MJVT = LogisticRegression(max_iter=1191)

In [None]:
# Hard voting: each base model casts one vote per sample and the majority label wins.
majority_voting_classifier = VotingClassifier(
    estimators=[
        ('DT', decision_tree_classifier),
        ('GNB', gauss_nb_MJVT),
        ('LR', logistic_regression_MJVT),
    ],
    voting='hard',
)
majority_voting_classifier.fit(spam_training_data, spam_training_target)
majority_voting_classifier_predict = majority_voting_classifier.predict(spam_test_data)

In [None]:
# Score the voting ensemble on the held-out rows.
rate_classifier(
    test_target_data=spam_test_target,
    test_target_predict=majority_voting_classifier_predict,
)

## Task 1 
Compare the accuracies of the fused model with AdaBoost Ensemble with Decision Tree as the base learner. Train the classifiers using the first 1000 instances and use the remaining 3601 for testing. [25 points]

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the first positional slot works on both sides of that change.
# random_state makes the boosted ensemble repeatable across runs.
# NOTE(review): decision_tree_classifier is an unpruned tree; if it fits the
# training rows perfectly, AdaBoost stops after the first round. Consider a
# shallow tree (e.g. max_depth=1) if real boosting is intended.
ada_boost_classifier = AdaBoostClassifier(decision_tree_classifier, random_state=99).fit(
    spam_training_data, spam_training_target
)
ada_boost_classifier_predict = ada_boost_classifier.predict(spam_test_data)

In [None]:
# Heading so this report can be told apart from the one that follows it.
print("ADA Boost")
rate_classifier(spam_test_target, ada_boost_classifier_predict)

In [None]:
# Majority-voting results repeated here for a side-by-side comparison with AdaBoost.
print("Majority Voting")
rate_classifier(spam_test_target, majority_voting_classifier_predict)

The Majority Voting classifier performed better than the AdaBoost classifier, with higher accuracy and lower false-positive and false-negative rates.

## Task 2 
Compare the accuracies of the fused model with Random Forest (with 1000 base learners). Train the classifiers using the first 1000 instances and use the remaining 3601 for testing. [25 points]

In [None]:
# Forest of 1000 trees, as the task requires. The seed makes the randomised
# forest repeatable, so the scores discussed below can be reproduced.
random_forest_classifier = RandomForestClassifier(n_estimators=1000, random_state=99).fit(
    spam_training_data, spam_training_target
)
random_forest_classifier_predict = random_forest_classifier.predict(spam_test_data)

In [None]:
# Heading so this report can be told apart from the one that follows it.
print("Random Forest")
rate_classifier(spam_test_target, random_forest_classifier_predict)

In [None]:
# Majority-voting results repeated here for a side-by-side comparison with Random Forest.
print("Majority Voting")
rate_classifier(spam_test_target, majority_voting_classifier_predict)

The Random Forest classifier performed better than the Majority Voting classifier, with higher accuracy and a lower false-positive rate; however, its false-negative rate was higher by 0.97%.

## Task 3 
Study the impact of training sample size on the accuracies of the fused classifier and the AdaBoost Ensemble with Decision Tree as the base learner. Compare their accuracies with the following training-test splits: 
* 50%-50%
* 60%-40%
* 70%-30%
* 80%-20%
* [50 points]

### Task 3.1
* 50%-50% split

Majority Voting 50%-50%

In [None]:
# Hold out half of the rows for testing; the fixed seed keeps the partition repeatable.
mv_50_50_train_set, mv_50_50_test_set = train_test_split(
    original_spam_df,
    test_size=0.50,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
mv_50_50_training_data = mv_50_50_train_set[non_class_features]
mv_50_50_training_target = mv_50_50_train_set['Class']
mv_50_50_test_data = mv_50_50_test_set[non_class_features]
mv_50_50_test_target = mv_50_50_test_set['Class']

In [None]:
# Confirm the sizes of the 50%-50% training and test partitions.
print(f'{mv_50_50_training_data.shape = }\n{mv_50_50_test_data.shape = }')

In [None]:
# Unfitted base learners for the 50%-50% voting ensemble. VotingClassifier.fit
# clones and trains every estimator it is given, so fitting them here as well
# would only repeat the work.
decision_tree_mv_50_50 = DecisionTreeClassifier(random_state=99)  # seeded so the tree is repeatable
gauss_nb_mv_50_50 = GaussianNB()
logistic_regression_mv_50_50 = LogisticRegression(max_iter=5000)

In [None]:
# Hard voting: each base model casts one vote per sample and the majority label wins.
mv_50_50_classifier = VotingClassifier(
    estimators=[
        ('DT', decision_tree_mv_50_50),
        ('GNB', gauss_nb_mv_50_50),
        ('LR', logistic_regression_mv_50_50),
    ],
    voting='hard',
)
mv_50_50_classifier.fit(mv_50_50_training_data, mv_50_50_training_target)
mv_50_50_classifier_predict = mv_50_50_classifier.predict(mv_50_50_test_data)

ADA Boost 50%-50%

In [None]:
# Same test_size and seed as the majority-voting 50%-50% split, so AdaBoost is
# scored on the same held-out rows.
ada_50_50_train_set, ada_50_50_test_set = train_test_split(
    original_spam_df,
    test_size=0.50,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
ada_50_50_training_data = ada_50_50_train_set[non_class_features]
ada_50_50_training_target = ada_50_50_train_set['Class']
ada_50_50_test_data = ada_50_50_test_set[non_class_features]
ada_50_50_test_target = ada_50_50_test_set['Class']

In [None]:
# Confirm the sizes of the 50%-50% training and test partitions used for AdaBoost.
print(f'{ada_50_50_training_data.shape = }\n{ada_50_50_test_data.shape = }')

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the first positional slot works on both sides of that change.
# random_state makes the boosted ensemble repeatable across runs.
# NOTE(review): decision_tree_classifier is an unpruned tree; if it fits the
# training rows perfectly, AdaBoost stops after the first round.
ada_boost_50_50_classifier = AdaBoostClassifier(decision_tree_classifier, random_state=99).fit(
    ada_50_50_training_data, ada_50_50_training_target
)
ada_boost_50_50_classifier_predict = ada_boost_50_50_classifier.predict(ada_50_50_test_data)

#### 50%-50% split Comparison:
* Majority Voting [(1) Decision Tree, (2) Gaussian Naïve Bayes, and (3) Logistic Regression]
* ADA Boost [Decision Tree]

In [None]:
# Label the output, then score the voting ensemble on the 50% hold-out.
print("Majority Voting 50%-50% Split")
rate_classifier(
    test_target_data=mv_50_50_test_target,
    test_target_predict=mv_50_50_classifier_predict,
)

In [None]:
# Label the output, then score AdaBoost on the 50% hold-out.
print("ADA Boost 50%-50% Split")
rate_classifier(
    test_target_data=ada_50_50_test_target,
    test_target_predict=ada_boost_50_50_classifier_predict,
)

### Task 3.2
* 60%-40% split

Majority Voting 60%-40%

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the partition repeatable.
mv_60_40_train_set, mv_60_40_test_set = train_test_split(
    original_spam_df,
    test_size=0.40,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
mv_60_40_training_data = mv_60_40_train_set[non_class_features]
mv_60_40_training_target = mv_60_40_train_set['Class']
mv_60_40_test_data = mv_60_40_test_set[non_class_features]
mv_60_40_test_target = mv_60_40_test_set['Class']

In [None]:
# Confirm the sizes of the 60%-40% training and test partitions.
print(f'{mv_60_40_training_data.shape = }\n{mv_60_40_test_data.shape = }')

In [None]:
# Unfitted base learners for the 60%-40% voting ensemble. VotingClassifier.fit
# clones and trains every estimator it is given, so fitting them here as well
# would only repeat the work.
decision_tree_mv_60_40 = DecisionTreeClassifier(random_state=99)  # seeded so the tree is repeatable
gauss_nb_mv_60_40 = GaussianNB()
logistic_regression_mv_60_40 = LogisticRegression(max_iter=5000)

In [None]:
# Hard voting: each base model casts one vote per sample and the majority label wins.
mv_60_40_classifier = VotingClassifier(
    estimators=[
        ('DT', decision_tree_mv_60_40),
        ('GNB', gauss_nb_mv_60_40),
        ('LR', logistic_regression_mv_60_40),
    ],
    voting='hard',
)
mv_60_40_classifier.fit(mv_60_40_training_data, mv_60_40_training_target)
mv_60_40_classifier_predict = mv_60_40_classifier.predict(mv_60_40_test_data)

ADA Boost 60%-40%

In [None]:
# Same test_size and seed as the majority-voting 60%-40% split, so AdaBoost is
# scored on the same held-out rows.
ada_60_40_train_set, ada_60_40_test_set = train_test_split(
    original_spam_df,
    test_size=0.40,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
ada_60_40_training_data = ada_60_40_train_set[non_class_features]
ada_60_40_training_target = ada_60_40_train_set['Class']
ada_60_40_test_data = ada_60_40_test_set[non_class_features]
ada_60_40_test_target = ada_60_40_test_set['Class']

In [None]:
# Confirm the sizes of the 60%-40% training and test partitions used for AdaBoost.
print(f'{ada_60_40_training_data.shape = }\n{ada_60_40_test_data.shape = }')

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the first positional slot works on both sides of that change.
# random_state makes the boosted ensemble repeatable across runs.
# NOTE(review): decision_tree_classifier is an unpruned tree; if it fits the
# training rows perfectly, AdaBoost stops after the first round.
ada_boost_60_40_classifier = AdaBoostClassifier(decision_tree_classifier, random_state=99).fit(
    ada_60_40_training_data, ada_60_40_training_target
)
ada_boost_60_40_classifier_predict = ada_boost_60_40_classifier.predict(ada_60_40_test_data)

#### 60%-40% split Comparison:
* Majority Voting [(1) Decision Tree, (2) Gaussian Naïve Bayes, and (3) Logistic Regression]
* ADA Boost [Decision Tree]

In [None]:
# Label the output, then score the voting ensemble on the 40% hold-out.
print("Majority Voting 60%-40% Split")
rate_classifier(
    test_target_data=mv_60_40_test_target,
    test_target_predict=mv_60_40_classifier_predict,
)

In [None]:
# Label the output, then score AdaBoost on the 40% hold-out.
print("ADA Boost 60%-40% Split")
rate_classifier(
    test_target_data=ada_60_40_test_target,
    test_target_predict=ada_boost_60_40_classifier_predict,
)

### Task 3.3
* 70%-30% split

Majority Voting 70%-30%

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition repeatable.
mv_70_30_train_set, mv_70_30_test_set = train_test_split(
    original_spam_df,
    test_size=0.30,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
mv_70_30_training_data = mv_70_30_train_set[non_class_features]
mv_70_30_training_target = mv_70_30_train_set['Class']
mv_70_30_test_data = mv_70_30_test_set[non_class_features]
mv_70_30_test_target = mv_70_30_test_set['Class']

In [None]:
# Confirm the sizes of the 70%-30% training and test partitions.
print(f'{mv_70_30_training_data.shape = }\n{mv_70_30_test_data.shape = }')

In [None]:
# Unfitted base learners for the 70%-30% voting ensemble. VotingClassifier.fit
# clones and trains every estimator it is given, so fitting them here as well
# would only repeat the work.
decision_tree_mv_70_30 = DecisionTreeClassifier(random_state=99)  # seeded so the tree is repeatable
gauss_nb_mv_70_30 = GaussianNB()
logistic_regression_mv_70_30 = LogisticRegression(max_iter=5000)

In [None]:
# Hard voting: each base model casts one vote per sample and the majority label wins.
mv_70_30_classifier = VotingClassifier(
    estimators=[
        ('DT', decision_tree_mv_70_30),
        ('GNB', gauss_nb_mv_70_30),
        ('LR', logistic_regression_mv_70_30),
    ],
    voting='hard',
)
mv_70_30_classifier.fit(mv_70_30_training_data, mv_70_30_training_target)
mv_70_30_classifier_predict = mv_70_30_classifier.predict(mv_70_30_test_data)

ADA Boost 70%-30%

In [None]:
# Same test_size and seed as the majority-voting 70%-30% split, so AdaBoost is
# scored on the same held-out rows.
ada_70_30_train_set, ada_70_30_test_set = train_test_split(
    original_spam_df,
    test_size=0.30,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
ada_70_30_training_data = ada_70_30_train_set[non_class_features]
ada_70_30_training_target = ada_70_30_train_set['Class']
ada_70_30_test_data = ada_70_30_test_set[non_class_features]
ada_70_30_test_target = ada_70_30_test_set['Class']

In [None]:
# Confirm the sizes of the 70%-30% training and test partitions used for AdaBoost.
print(f'{ada_70_30_training_data.shape = }\n{ada_70_30_test_data.shape = }')

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the first positional slot works on both sides of that change.
# random_state makes the boosted ensemble repeatable across runs.
# NOTE(review): decision_tree_classifier is an unpruned tree; if it fits the
# training rows perfectly, AdaBoost stops after the first round.
ada_boost_70_30_classifier = AdaBoostClassifier(decision_tree_classifier, random_state=99).fit(
    ada_70_30_training_data, ada_70_30_training_target
)
ada_boost_70_30_classifier_predict = ada_boost_70_30_classifier.predict(ada_70_30_test_data)

#### 70%-30% split Comparison:
* Majority Voting [(1) Decision Tree, (2) Gaussian Naïve Bayes, and (3) Logistic Regression]
* ADA Boost [Decision Tree]

In [None]:
# Label the output, then score the voting ensemble on the 30% hold-out.
print("Majority Voting 70%-30% Split")
rate_classifier(
    test_target_data=mv_70_30_test_target,
    test_target_predict=mv_70_30_classifier_predict,
)

In [None]:
# Label the output, then score AdaBoost on the 30% hold-out.
print("ADA Boost 70%-30% Split")
rate_classifier(
    test_target_data=ada_70_30_test_target,
    test_target_predict=ada_boost_70_30_classifier_predict,
)

### Task 3.4
* 80%-20% split

Majority Voting 80%-20%

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
mv_80_20_train_set, mv_80_20_test_set = train_test_split(
    original_spam_df,
    test_size=0.20,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
mv_80_20_training_data = mv_80_20_train_set[non_class_features]
mv_80_20_training_target = mv_80_20_train_set['Class']
mv_80_20_test_data = mv_80_20_test_set[non_class_features]
mv_80_20_test_target = mv_80_20_test_set['Class']

In [None]:
# Confirm the sizes of the 80%-20% training and test partitions.
print(f'{mv_80_20_training_data.shape = }\n{mv_80_20_test_data.shape = }')

In [None]:
# Unfitted base learners for the 80%-20% voting ensemble. VotingClassifier.fit
# clones and trains every estimator it is given, so fitting them here as well
# would only repeat the work.
decision_tree_mv_80_20 = DecisionTreeClassifier(random_state=99)  # seeded so the tree is repeatable
gauss_nb_mv_80_20 = GaussianNB()
logistic_regression_mv_80_20 = LogisticRegression(max_iter=5000)

In [None]:
# Hard voting: each base model casts one vote per sample and the majority label wins.
mv_80_20_classifier = VotingClassifier(
    estimators=[
        ('DT', decision_tree_mv_80_20),
        ('GNB', gauss_nb_mv_80_20),
        ('LR', logistic_regression_mv_80_20),
    ],
    voting='hard',
)
mv_80_20_classifier.fit(mv_80_20_training_data, mv_80_20_training_target)
mv_80_20_classifier_predict = mv_80_20_classifier.predict(mv_80_20_test_data)

ADA Boost 80%-20%

In [None]:
# Same test_size and seed as the majority-voting 80%-20% split, so AdaBoost is
# scored on the same held-out rows.
ada_80_20_train_set, ada_80_20_test_set = train_test_split(
    original_spam_df,
    test_size=0.20,
    random_state=99,
)

In [None]:
# Separate the predictor columns from the 'Class' label for each partition.
ada_80_20_training_data = ada_80_20_train_set[non_class_features]
ada_80_20_training_target = ada_80_20_train_set['Class']
ada_80_20_test_data = ada_80_20_test_set[non_class_features]
ada_80_20_test_target = ada_80_20_test_set['Class']

In [None]:
# Confirm the sizes of the 80%-20% training and test partitions used for AdaBoost.
print(f'{ada_80_20_training_data.shape = }\n{ada_80_20_test_data.shape = }')

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# and the first positional slot works on both sides of that change.
# random_state makes the boosted ensemble repeatable across runs.
# NOTE(review): decision_tree_classifier is an unpruned tree; if it fits the
# training rows perfectly, AdaBoost stops after the first round.
ada_boost_80_20_classifier = AdaBoostClassifier(decision_tree_classifier, random_state=99).fit(
    ada_80_20_training_data, ada_80_20_training_target
)
ada_boost_80_20_classifier_predict = ada_boost_80_20_classifier.predict(ada_80_20_test_data)

#### 80%-20% split Comparison:
* Majority Voting [(1) Decision Tree, (2) Gaussian Naïve Bayes, and (3) Logistic Regression]
* ADA Boost [Decision Tree]

In [None]:
# Label the output, then score the voting ensemble on the 20% hold-out.
print("Majority Voting 80%-20% Split")
rate_classifier(
    test_target_data=mv_80_20_test_target,
    test_target_predict=mv_80_20_classifier_predict,
)

In [None]:
# Label the output, then score AdaBoost on the 20% hold-out.
print("ADA Boost 80%-20% Split")
rate_classifier(
    test_target_data=ada_80_20_test_target,
    test_target_predict=ada_boost_80_20_classifier_predict,
)