In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply seaborn's default visual theme to the matplotlib/seaborn figures
# drawn later in this notebook.
sns.set_theme()


In [None]:
# Load the CNN news dataset; the path is relative to the kernel's working
# directory.
df = pd.read_csv("news.csv")

# Number of rows in the raw dataset.
len(df)


In [None]:
# Keep only the three text columns and the target column used below.
keep_columns = ["Title", "Description", "Body", "Theme"]

# Restrict to those columns and drop every row that is missing a value in
# any of them, so the string concatenation in the next cell never sees NaN.
df = df[keep_columns].dropna()

# Number of rows that remain after trimming and dropping incomplete rows.
len(df)


In [None]:
# Merge title, description, and body into one space-separated text column
# ("Features"); the theme column is exposed as the target ("Label").
SPACE = " "
title_and_description = df["Title"] + SPACE + df["Description"]
df["Features"] = title_and_description + SPACE + df["Body"]
df["Label"] = df["Theme"]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
X, y = df["Features"], df["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text. The vocabulary is capped at 5000
# terms and English stop words are removed.
tfidf_options = {"max_features": 5000, "stop_words": "english"}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Linear-kernel support vector machine. probability=True is required for the
# predict_proba call made in a later cell.
# NOTE(review): gamma is documented as the coefficient for the rbf/poly/
# sigmoid kernels, so it presumably has no effect with kernel="linear" —
# confirm before tuning it.
svm_settings = dict(
    C=1.0,
    kernel="linear",
    gamma="scale",
    probability=True,
    random_state=21,
)
svm_classifier = SVC(**svm_settings)
svm_classifier.fit(X_train_tfidf, y_train)


In [None]:
# Multinomial naive Bayes with scikit-learn's default settings, trained on
# the TF-IDF features.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Decision tree limited to depth 4, with a fixed seed so the fitted tree is
# reproducible.
tree_settings = {"max_depth": 4, "random_state": 21}
decision_tree_classifier = DecisionTreeClassifier(**tree_settings)
decision_tree_classifier.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Class-probability estimates on the test set, one array per classifier
# (SVM, naive Bayes, decision tree — in that order).
svm_probs, nb_probs, dt_probs = (
    model.predict_proba(X_test_tfidf)
    for model in (
        svm_classifier,
        naive_bayes_classifier,
        decision_tree_classifier,
    )
)


In [None]:
# Hard label predictions on the test set from each individual classifier
# (SVM, naive Bayes, decision tree — in that order).
svm_predictions, nb_predictions, dt_predictions = [
    model.predict(X_test_tfidf)
    for model in (
        svm_classifier,
        naive_bayes_classifier,
        decision_tree_classifier,
    )
]


In [None]:
# Text summary of the SVM's per-class metrics on the held-out test set.
svm_report = classification_report(y_true=y_test, y_pred=svm_predictions)
print(svm_report)


In [None]:
# Text summary of naive Bayes' per-class metrics on the held-out test set.
nb_report = classification_report(y_true=y_test, y_pred=nb_predictions)
print(nb_report)


In [None]:
# Text summary of the decision tree's per-class metrics on the held-out
# test set.
dt_report = classification_report(y_true=y_test, y_pred=dt_predictions)
print(dt_report)


In [None]:
# Compare the class probabilities each classifier assigns to one test
# article, drawn as one bar-chart panel per model.
SAMPLE_INDEX = 0  # row of the test set to visualise
panels = [
    ("Support Vector Machine", svm_classifier, svm_probs),
    ("Naive Bayes", naive_bayes_classifier, nb_probs),
    ("Decision Tree", decision_tree_classifier, dt_probs),
]

fig, axis = plt.subplots(1, len(panels), figsize=(10, 4), dpi=300)
for ax, (title, classifier, probs) in zip(axis, panels):
    # x uses the classifier's own classes_ so bars line up with the column
    # order of its predict_proba output.
    sns.barplot(y=probs[SAMPLE_INDEX], x=classifier.classes_, ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Predicted probability")
    # Probabilities lie in [0, 1]; a shared range keeps panels comparable.
    ax.set_ylim(0, 1)
    # Rotate the class names so neighbouring tick labels do not overlap.
    ax.tick_params(axis="x", rotation=90)

# Keep titles and rotated tick labels from colliding or being clipped; ending
# on this call also avoids echoing an Axes repr under the figure.
fig.tight_layout()


In [None]:
# Confidence-based ensemble: for every test article, use the prediction of
# whichever classifier reports the highest class probability. Ties go to the
# earlier classifier in the order SVM, naive Bayes, decision tree.
ensemble_members = [
    (svm_classifier, svm_probs),
    (naive_bayes_classifier, nb_probs),
    (decision_tree_classifier, dt_probs),
]

# Column j holds classifier j's highest probability for each test sample.
top_probs = np.column_stack(
    [probs.max(axis=1) for _, probs in ensemble_members]
)
# Column j holds the label classifier j assigns that highest probability to;
# each classifier's own classes_ is used to map column index -> label.
top_labels = np.column_stack(
    [clf.classes_[probs.argmax(axis=1)] for clf, probs in ensemble_members]
)

# argmax returns the first maximum, which yields the tie-breaking order
# described above.
most_confident = top_probs.argmax(axis=1)
sample_rows = np.arange(len(most_confident))

# Kept as a plain list, one predicted label per test sample.
ensemble_predictions = top_labels[sample_rows, most_confident].tolist()


In [None]:
# Text summary of the ensemble's per-class metrics on the held-out test set.
ensemble_report = classification_report(
    y_true=y_test,
    y_pred=ensemble_predictions,
)
print(ensemble_report)


In [None]:
# Test-set accuracy (fraction correct) for each classifier and the ensemble.
svm_accuracy, nb_accuracy, dt_accuracy, ensemble_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (
        svm_predictions,
        nb_predictions,
        dt_predictions,
        ensemble_predictions,
    )
)


In [None]:
# Bar chart comparing the test accuracy, in percent, of the three individual
# models and the ensemble.
labels = ["SVM", "Naive Bayes", "Decision Tree", "Ensemble"]
accuracies = [
    100 * score
    for score in (svm_accuracy, nb_accuracy, dt_accuracy, ensemble_accuracy)
]

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=300)
# Draw explicitly on the axes created above instead of relying on the
# implicit "current axes".
sns.barplot(y=accuracies, x=labels, ax=ax)
ax.set_title("Accuracy of Models")
ax.set_ylabel("Accuracy (%)")

# Annotate each bar with its value rounded to two decimals. Every container
# is labelled, not only the first, so no bar is left unlabelled if the bars
# end up split across several containers.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", fontsize=10)
