In [None]:
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from dotenv import load_dotenv

In [None]:
# Limit TensorFlow to 4 threads for parallelism inside a single op.
tf.config.threading.set_intra_op_parallelism_threads(4)

In [None]:
# Load variables from a .env file into the process environment (python-dotenv),
# so later os.getenv(...) lookups can see them.
load_dotenv()

In [None]:
# Constants
# Length every token sequence is padded/truncated to (pad_sequences maxlen).
MAXLEN = 250
# Vocabulary cap handed to the Keras Tokenizer (num_words).
MAX_NB_WORDS = 50_000
# English stop words from NLTK; used by preprocess_text to filter tokens.
STOPWORDS = set(stopwords.words('english'))
# Label encoding: value of the dataset's "type" column -> integer class id.
# Classes: Misinformation, Credible, Biased/Political, Unreliable/Caution
NEWS_CLASS_MAPPING = {
    "misinformation": 0,
    "credible": 1,
    "political_bias": 2,
    "unreliable": 3,
}

In [None]:
def preprocess_text(text, stop_words=None):
    """Normalize raw text into a lowercase, stop-word-free string.

    URLs (``http...``), ``@mentions`` and every character that is not an
    ASCII letter or whitespace are replaced by spaces. The remainder is
    lowercased, split on whitespace, filtered against the stop-word
    collection, and re-joined with single spaces.

    Args:
        text: Raw input. Anything that is not a non-empty ``str`` (``None``,
            NaN, numbers, ...) yields ``""``.
        stop_words: Optional container of lowercase words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text, or ``""`` when the input is unusable or an
        unexpected error occurs (the error is printed, not raised).
    """
    if not isinstance(text, str) or not text:
        return ""

    try:
        if stop_words is None:
            stop_words = STOPWORDS

        # One pass strips URLs, mentions and anything that is not a letter
        # or whitespace.
        cleaned = re.sub(r"http\S+|@\w+|[^a-zA-Z\s]", " ", text)

        # split() with no arguments already collapses newlines, tabs and
        # repeated spaces, so no separate whitespace clean-up is needed.
        tokens = cleaned.lower().split()

        return " ".join(word for word in tokens if word not in stop_words)
    except Exception as e:
        # Best-effort: a single bad record must not abort a bulk run.
        print(f"Error processing text: {e}")
        return ""

In [None]:
def tokenize_text(X_t, X_v):
    """Fit a Keras tokenizer on the training texts and encode both splits.

    The tokenizer is fitted on ``X_t`` only. Both ``X_t`` and ``X_v`` are
    then converted to integer sequences and padded at the end
    (``padding='post'``) to length ``MAXLEN``.

    Args:
        X_t: Training texts.
        X_v: Validation texts.

    Returns:
        Tuple ``(train_padded, val_padded, vocab_len, tokenizer)`` where
        ``vocab_len`` is ``len(tokenizer.word_index) + 1``.
    """
    fitted = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
    fitted.fit_on_texts(X_t)

    def encode(texts):
        # Text -> integer sequences -> fixed-length matrix.
        sequences = fitted.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=MAXLEN, padding='post')

    train_matrix = encode(X_t)
    val_matrix = encode(X_v)

    # NOTE(review): word_index lists every token seen during fitting and is
    # not capped by num_words, so this can exceed MAX_NB_WORDS.
    vocab_len = 1 + len(fitted.word_index)
    print(f"Vocabulary size: {vocab_len}")

    return train_matrix, val_matrix, vocab_len, fitted

In [None]:
# Location of the news CSV, read from the NEWS_DATASET environment variable.
# os.getenv returns None when the variable is unset.
NEWS_DATASET = os.getenv("NEWS_DATASET")

In [None]:
# Load only the two needed columns; nrows caps how much of the CSV is read.
data = pd.read_csv(NEWS_DATASET, usecols=["text", "type"], nrows=1_000_000)

# Encode labels before splitting. A "type" that is not a key of
# NEWS_CLASS_MAPPING maps to NaN; such rows (and rows with missing text) are
# dropped here, because NaN would otherwise break tokenization or model fitting.
# NOTE(review): preprocess_text is not applied to the text column in the
# visible notebook -- confirm whether the CSV is already cleaned.
n_loaded = len(data)
data["label"] = data["type"].map(NEWS_CLASS_MAPPING)
data = data.dropna(subset=["text", "label"])
print(f"Dropped {n_loaded - len(data)} of {n_loaded} rows with missing text or unmapped type")

x = data["text"]
y = data["label"].astype("int64")
del data

# Fixed seed so the train/validation split is reproducible across runs.
x_t, x_v, y_t, y_v = train_test_split(x, y, test_size=0.2, random_state=42)
del x, y

X_train_padded, X_val_padded, vocab_size, tokenizer = tokenize_text(x_t, x_v)

# Persist the fitted tokenizer so the same vocabulary can be reused later.
with open("tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
def create_random_forest_classifier(n_estimators=100, random_state=42):
    """Build an unfitted scikit-learn ``RandomForestClassifier``.

    Args:
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the classifier. Defaults to 42.

    Returns:
        A new, unfitted ``RandomForestClassifier``.
    """
    # Imported here to keep this cell self-contained.
    from sklearn.ensemble import RandomForestClassifier
    return RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

In [None]:
# Train the random forest on the padded token-id matrices and evaluate it.
# NOTE(review): the features are raw token ids, which the forest treats as
# ordinal numbers -- consider bag-of-words/TF-IDF features instead.
rf = create_random_forest_classifier()
rf.fit(X_train_padded, y_t)
y_pred = rf.predict(X_val_padded)

# Pin the class order explicitly. Without `labels`, a class that is absent
# from the validation split shrinks the matrix and misaligns the names.
class_names = list(NEWS_CLASS_MAPPING.keys())
class_ids = list(NEWS_CLASS_MAPPING.values())

cm = confusion_matrix(y_v, y_pred, labels=class_ids)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title("Random forest confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

accuracy = accuracy_score(y_v, y_pred)
print(f"Accuracy: {accuracy:.4f}")
report = classification_report(y_v, y_pred, labels=class_ids, target_names=class_names)
print(report)

# Save the fitted model.
with open("random_forest.pickle", "wb") as f:
    pickle.dump(rf, f)