In [29]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

#Convert the CSV files to dataframes. 
data= pd.read_csv('lemmatized_articles.csv')
data2= pd.read_csv('BBC_train_full.csv')

#Gets the tokenized data
x = data['tokens']
#Gets the labels
y = data2['category'].values

#This converts the tokens to a vector so it can be converted to an array later to be read by the bot
vectorizer = CountVectorizer()
x_transformed = vectorizer.fit_transform(x)
joblib.dump(vectorizer, 'vectorizer.joblib')

#Sets the random seed. Some potential numbers to test: 40, 52, 53, 605, 6010, 
seed = 41

#Splits the data into 20% test and 80% train. Random state is the seed used for the randomization.
x_train, x_test, y_train, y_test = train_test_split(x_transformed, y, test_size=0.2, random_state=seed)

#Further splits the 80% data into two sets of 40%. This is so we can train the two bots with the same data set.
x_train_bot1, x_train_bot2, y_train_bot1, y_train_bot2 = train_test_split(x_train, y_train, test_size=0.5, random_state=seed)

#Creates the bots
NBBot = GaussianNB()
RFBot = RandomForestClassifier(random_state=seed)


#This is where we 'train' the bots, by giving them things to reference
NBBot.fit(x_train_bot1.toarray(), y_train_bot1)
RFBot.fit(x_train_bot2, y_train_bot2)

#This stores the bots predictions. We also convert x_test to an array so it can be read correctly
NB_predictions = NBBot.predict(x_test.toarray())
RF_predictions= RFBot.predict(x_test)

#We get the accuray score by compparing the predictions to the labels
accuracyNB = accuracy_score(y_test, NB_predictions)
accuracyRF = accuracy_score(y_test, RF_predictions)

#Saves the bots for reuse later
joblib.dump(NBBot, 'NBBotML.joblib')
joblib.dump(RFBot, 'RFBotML.joblib')


# Load the model
#loaded_bot = joblib.load('NBBot.joblib')

# Use the loaded model for predictions
#loaded_predictions = loaded_bot.predict(x_test.toarray())


#Classification report
#This report provides detailed metrics about the model's performance
#It includes precision, recall, F1-score, and support for each class
print("\nClassification Report NB:")
#Generate the report by comparing true labels ('y_test') with predicted labels ('bot_predictions')
print(classification_report(y_test, NB_predictions))
print("\nClassification Report RF:")
print(classification_report(y_test, RF_predictions))


'''
#Creating a per-category accuracy report
def per_category_accuracy(y_true, y_pred):
    #Extract unique categories form the true labels and sort them
    categories = sorted(set(y_true))
    #Initialize a dictionary to hold accuracy metrics for each category
    accuracy_per_category = {}
    
    #Loop over each unique category to evaluate its perormance
    for category in categories:
        #Count the total number of actual instances of this category in the true labels
        actual_count = (y_true == category).sum() 
        #Count the number of instances correctly predicted as this category
        correct_count = ((y_true == category) & (y_pred == category)).sum()
        
        #Calculate the accuracy for the current category
        #Avoid division by zero by checking if actual_count os greater than zero
        accuracy = correct_count / actual_count if actual_count > 0 else 0
       #Store the results in the dictionary with detailed metrics
        accuracy_per_category[category] = {
            'Total': actual_count,
            'Correct': correct_count,
            'Accuracy': accuracy
        }
    #Convert the accuracy dictionary into a Pandas DataFrame for better display
    return pd.DataFrame(accuracy_per_category).T

#Call the function to generate the per-category accuracy report using the true and predicted labels
category_accuracy_report = per_category_accuracy(y_test, bot_predictions)

#Print the per-category accuracy report to analyze the model performance on each class
print("\nPer-Category Accuracy Report:")
print(category_accuracy_report)

#Visualization: Create a confusion Matrix to viaualize the performance of theclassification model
#The confusion matrix summarizes the count of correct and incorrect predictions, which helps identify areas of improvement
confusion_mat = confusion_matrix(y_test, bot_predictions)

#Set up the visual parameters for the confusion matrix plot
plt.figure(figsize=(10, 7))

#Get and sort unique categories for label display in the confusion matrix
unique_categories = sorted(set(y))

#Create a heatmap to represent the confusion matrix visually
#Use annotations to display the counts in each cell with a blue color palette
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues', xticklabels=sorted(set(y)), yticklabels=sorted(set(y)))

#Label the axes to clarify what is being represented in the confusion matrix
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()
'''




Classification Report NB:
               precision    recall  f1-score   support

     business       0.89      0.89      0.89        84
entertainment       0.85      0.88      0.87        60
     politics       0.89      0.89      0.89        65
        sport       1.00      0.94      0.97        69
         tech       0.89      0.91      0.90        68

     accuracy                           0.90       346
    macro avg       0.91      0.90      0.90       346
 weighted avg       0.91      0.90      0.91       346


Classification Report RF:
               precision    recall  f1-score   support

     business       0.86      0.99      0.92        84
entertainment       1.00      0.85      0.92        60
     politics       0.97      0.98      0.98        65
        sport       0.96      0.99      0.97        69
         tech       0.97      0.88      0.92        68

     accuracy                           0.94       346
    macro avg       0.95      0.94      0.94       346
 weigh

'\n#Creating a per-category accuracy report\ndef per_category_accuracy(y_true, y_pred):\n    #Extract unique categories form the true labels and sort them\n    categories = sorted(set(y_true))\n    #Initialize a dictionary to hold accuracy metrics for each category\n    accuracy_per_category = {}\n    \n    #Loop over each unique category to evaluate its perormance\n    for category in categories:\n        #Count the total number of actual instances of this category in the true labels\n        actual_count = (y_true == category).sum() \n        #Count the number of instances correctly predicted as this category\n        correct_count = ((y_true == category) & (y_pred == category)).sum()\n        \n        #Calculate the accuracy for the current category\n        #Avoid division by zero by checking if actual_count os greater than zero\n        accuracy = correct_count / actual_count if actual_count > 0 else 0\n       #Store the results in the dictionary with detailed metrics\n        ac