In [19]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three input CSVs into dataframes in one go: the lemmatized article
# text, the labelled BBC training set, and a file of predicted labels.
data, data2, data3 = (
    pd.read_csv('lemmatized_articles.csv'),
    pd.read_csv('BBC_train_full.csv'),
    pd.read_csv('predicted_labels.csv'),
)

# Features are the 'tokens' column of the lemmatized file; labels are the
# 'category' column of the BBC file. The labels are pulled out as a plain
# array, so they are matched to the tokens by row position, not by index.
x = data['tokens']
y = data2['category'].values

# Two-column frame (category first, then tokens) for the original articles.
combined_one = pd.DataFrame(dict(category=y, tokens=x))

# Append the rows of data3 underneath and renumber the index from 0.
# NOTE(review): presumably data3 carries the same 'category' / 'tokens'
# columns; any column that does not line up is filled with NaN by concat.
df_combined = pd.concat((combined_one, data3), ignore_index=True)

# Save the merged data set without the index column.
df_combined.to_csv(path_or_buf='CombinedOriginalWithGenerated.csv', index=False)

# Random seed reused for both splits below (and available to later code).
# Other candidate values noted for testing: 40, 52, 53, 605.
seed = 6010

# Bag-of-words encoding: every document in the 'tokens' column becomes a row
# of word counts (a sparse matrix) that the classifiers can consume.
# NOTE(review): the vocabulary is learned from all rows, including the ones
# that end up in the test split below - confirm this is intended.
vectorizer = CountVectorizer()
x_transformed = vectorizer.fit_transform(df_combined['tokens'])

# First split: hold out 20% of the rows for testing, keep 80% for training.
x_train, x_test, y_train, y_test = train_test_split(
    x_transformed,
    df_combined['category'],
    test_size=0.2,
    random_state=seed,
)

# Second split: cut the 80% training portion in half, giving two separate
# training sets of roughly 40% of the data each (one half per bot). Note that
# the halves contain different rows, not copies of the same rows.
x_train_bot1, x_train_bot2, y_train_bot1, y_train_bot2 = train_test_split(
    x_train,
    y_train,
    test_size=0.5,
    random_state=seed,
)


# --- Naive Bayes bot ---------------------------------------------------------
# Built and trained on the first training half. The sparse count matrix is
# converted to a dense array with .toarray() before being handed to GaussianNB,
# both for fitting and for predicting.
NBBot = GaussianNB().fit(x_train_bot1.toarray(), y_train_bot1)
NB_predictions = NBBot.predict(x_test.toarray())

# --- Random Forest bot -------------------------------------------------------
# Built with the shared seed and trained on the second training half; it is
# given the sparse matrices directly.
RFBot = RandomForestClassifier(random_state=seed).fit(x_train_bot2, y_train_bot2)
RF_predictions = RFBot.predict(x_test)

# Accuracy of each bot: its predictions compared against the true test labels.
accuracyNB = accuracy_score(y_true=y_test, y_pred=NB_predictions)
accuracyRF = accuracy_score(y_true=y_test, y_pred=RF_predictions)

# Per-class precision / recall / f1 for each bot, true labels ('y_test')
# against that bot's predictions.
print("\nClassification Report NB:")
print(classification_report(y_true=y_test, y_pred=NB_predictions))
print("\nClassification Report RF:")
print(classification_report(y_true=y_test, y_pred=RF_predictions))



Classification Report NB:
               precision    recall  f1-score   support

     business       0.87      0.85      0.86        95
entertainment       0.93      0.83      0.88        82
     politics       0.94      0.93      0.93        97
        sport       0.99      0.93      0.96       100
         tech       0.76      0.96      0.85        71

     accuracy                           0.90       445
    macro avg       0.90      0.90      0.90       445
 weighted avg       0.91      0.90      0.90       445


Classification Report RF:
               precision    recall  f1-score   support

     business       0.90      0.99      0.94        95
entertainment       0.97      0.94      0.96        82
     politics       0.98      0.97      0.97        97
        sport       0.97      0.98      0.98       100
         tech       0.98      0.90      0.94        71

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weigh