### CAP 6640 
### Project 2 - Fake News Detection
### Mar 28, 2024

### Group 4
### Andres Graterol
###                   UCF ID: 4031393
### Zachary Lyons
###                   UCF ID: 4226832
### Christopher Hinkle
###                   UCF ID: 4038573
### Nicolas Leocadio
###                   UCF ID: 3791733

#### Imports

In [None]:
import os
import re

import nltk
import gensim
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
# Importing Libraries w.r.t Word Embedding layer & Lstm NN
from tensorflow.keras.layers import Embedding, Hashing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import one_hot

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

# Download necessary resources from nltk (only when they are missing).
# nltk.data.find() looks resources up by their path inside an nltk_data
# directory ("<category>/<name>"), whereas nltk.download() takes the bare
# package name. Looking up the bare name never succeeds, which would trigger
# the download check on every run.
for package_name, resource_path in (
    ('stopwords', 'corpora/stopwords'),
    ('punkt', 'tokenizers/punkt'),
    ('wordnet', 'corpora/wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

#### Step 1 - Data Collection

In [None]:
# Read in the training dataset as a dataframe
df = pd.read_csv('train.csv')

# Drop rows in which every column is NaN, then renumber the rows so the index
# is a contiguous 0..len(df)-1 range (later cells index rows by position).
df = df.dropna(how='all', axis=0).reset_index(drop=True)

# Merge the author and title into a single column.
# Concatenating a string with NaN yields NaN, so a missing author (or title)
# would wipe out the whole article; treat missing values as empty strings.
df['article'] = (df['author'].fillna('') + ' ' + df['title'].fillna('')).str.strip()
# print(df['article'])

# print(df)

#### Step 2 - Data Preprocessing

In [None]:
# Sanity check: show the title stored in the row labelled 1
df.loc[1, 'title']

In [None]:
# NOTE: Diverting from the tutorial - title and author are combined into the
# single 'article' column, and the original 'title' / 'author' columns are
# kept in the dataframe rather than dropped.
df.head(n=5)

In [None]:
# Split the dataframe into data (X) and labels (Y)

# Label encoding:
#   1 - unreliable
#   0 - reliable
labels = df.loc[:, 'label']

# The data is every column except the label
data = df.drop(columns=['label'])
# print(data)

In [None]:
# TODO: This only lemmatises the article feature, not the text found in the article - figure out if we have to do this

# Init lemmatizer
lemmatizer = WordNetLemmatizer()

# Build the stopword set once; rebuilding it for every word is needlessly slow
english_stopwords = set(stopwords.words('english'))

cleaned_corpus = []

# Re-use a previously cleaned corpus if one was cached on disk.
# The cache holds one line per dataframe row (empty articles are blank lines)
# so that it stays aligned with df.
if os.path.isfile('cleaned_corpus.txt'):
    with open('cleaned_corpus.txt', 'r') as f:
        cleaned_corpus = [line.rstrip() for line in f]

# (Re)build the corpus when there is no cache, or when the cache does not match
# the dataframe (e.g. an empty or stale file left over from an earlier run).
if len(cleaned_corpus) != len(df):
    cleaned_corpus = []
    # For each article (author + title) in the dataframe, clean the text.
    # Iterating over the values avoids label-based lookups, which break when
    # the index has gaps.
    for article in df['article']:
        # Remove any non-alphanumeric characters
        cleaned_article = re.sub('[^a-zA-Z0-9]', ' ', str(article))
        # Lowercase the text and split it into individual words
        cleaned_article = cleaned_article.lower().split()
        # Lemmatize the words and remove any stopwords
        cleaned_article = [lemmatizer.lemmatize(word) for word in cleaned_article if word not in english_stopwords]
        # Join the words back together
        cleaned_corpus.append(' '.join(cleaned_article))

    # Persist the result so later runs can skip the cleaning step
    with open('cleaned_corpus.txt', 'w') as f:
        f.writelines(cleaned_article + '\n' for cleaned_article in cleaned_corpus)

In [None]:
# Drop any labels associated with empty articles:
# build a per-row boolean mask (True = article has content) and use it to
# select the matching entries of the 'label' column.
labels = df.loc[[len(article) > 0 for article in cleaned_corpus], 'label']

In [None]:
# Keep only the non-empty articles in the corpus
cleaned_corpus = list(filter(None, cleaned_corpus))
len(cleaned_corpus)

#### Step 3 - Feature Engineering

In [None]:
# Size of the vocabulary - one_hot hashes every word to an integer index
# below vocab_size (per the Keras docs index 0 is reserved and never assigned,
# and distinct words may collide on the same index)
vocab_size = 5000

# One hot encoding the cleaned corpus: one list of word indices per article
onehot_repr = [one_hot(article, n=vocab_size) for article in cleaned_corpus]

# Finding the Max_len & Min_len of the sentences in the cleaned corpus
def get_min_max_lengths(cleaned_corpus):
    """Return the shortest and longest sentence lengths in the corpus.

    Length is measured in whitespace-separated words.

    Args:
        cleaned_corpus: Iterable of sentence strings.

    Returns:
        A ``(min_len, max_len)`` tuple. For an empty corpus the sentinels
        ``(inf, -inf)`` are returned unchanged.
    """
    # Sentinels that any real word count will replace
    shortest = float('inf')
    longest = float('-inf')

    for sentence in cleaned_corpus:
        # Split once per sentence and reuse the count for both comparisons
        word_count = len(sentence.split())
        longest = max(longest, word_count)
        shortest = min(shortest, word_count)

    return shortest, longest

# Report the shortest and longest article lengths (in words)
min_len, max_len = get_min_max_lengths(cleaned_corpus)
print(f"{min_len} {max_len}")