In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive used below are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing the modules used throughout the notebook
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import nltk
# Fetch the NLTK stop-word corpus; the lemmatization function reads its English list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame
dataset_path = "/content/drive/MyDrive/Fake News Data/Dataset/Final_Clean.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows. `head` is a method: without the call the cell only
# displays the bound-method repr instead of a compact table.
data.head()

<bound method NDFrame.head of        Unnamed: 0  Unnamed: 0.1              author  \
0               0             0      Louis Jacobson   
1               1             1          D.L. Davis   
2               2             2         Yacob Reyes   
3               3             3  Samantha Putterman   
4               4             4       Maria Ramirez   
...           ...           ...                 ...   
17580       17580          5935  Samantha Putterman   
17581       17581          5936          Eric Litke   
17582       17582          5937      Ciara O'Rourke   
17583       17583          5938  Samantha Putterman   
17584       17584          5939      Ciara O'Rourke   

                                               statement  \
0      “We created more new jobs in two years than an...   
1      "During my time in office, we've increased per...   
2      "(Ron) DeSantis' bill would remove: background...   
3      “Female student-athletes in Florida need to pr...   
4      "U

In [None]:
# Count the missing entries in every column
data.isna().sum()

Unnamed: 0            0
Unnamed: 0.1          0
author                0
statement            94
article            3071
source              210
date                  0
target              210
BinaryTarget          0
BinaryNumTarget       0
dtype: int64

In [None]:
# Combine author, statement and article into a single 'content' text column.
# Missing pieces are replaced with '' first: string + NaN evaluates to NaN, so a
# row lacking only its article (or statement) would otherwise lose ALL of its
# text and later break the regex-based cleaning, which expects a str.
data['content'] = (
    data['author'].fillna('')
    + ' ' + data['statement'].fillna('')
    + ' ' + data['article'].fillna('')
)

# **Lemmatization**

In [None]:
# NLTK helpers: a WordNet lemmatizer and a whitespace tokenizer
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [None]:
# Build the stop-word set once. The original evaluated stopwords.words('english')
# for every single token and then scanned the resulting list linearly; a frozenset
# gives constant-time membership tests and is computed a single time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# function to lemmatize
def lemmatization(content):
    """Clean one document and return it as a lemmatized, space-joined string.

    Steps: replace every non-letter character with a space, lowercase the
    text, split on whitespace, drop English stop words, lemmatize each
    remaining word with the module-level ``lemmatizer`` and join the result
    with single spaces.

    Parameters
    ----------
    content : str
        Raw document text. ``None`` and NaN (missing values coming from a
        DataFrame column) are treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or has no
        remaining words.
    """
    # NaN is the only value that is not equal to itself; guard against missing
    # cells so one bad row does not abort a whole-column `.apply`.
    if content is None or content != content:
        return ''

    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(lemmas)

In [None]:
# Download the WordNet corpus and the Open Multilingual WordNet data (omw-1.4)
# for the WordNetLemmatizer created above
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Run every document in the content column through the lemmatization function
data['content'] = data['content'].map(lemmatization)

In [None]:
# Save the processed dataframe for future use.
# index=False: writing the row index adds a new unnamed column on every
# save/load round trip (the loaded frame already carries 'Unnamed: 0' and
# 'Unnamed: 0.1' from earlier saves).
data.to_csv('/content/drive/MyDrive/Fake News Data/Dataset/Final_Processed.csv', index=False)

# **Vectorization using Bag of Words**

In [None]:
# Obtain the total words present in the dataset.
# Each entry of data.content is a single space-joined string, so it has to be
# split into words first; iterating over the string directly yields individual
# characters, not words.
list_of_words = [word for doc in data.content for word in doc.split()]
len(list_of_words)

4847349

In [None]:
# Vocabulary size: number of distinct entries in list_of_words
unique_words = set(list_of_words)
total_words = len(unique_words)
total_words

47905

In [None]:
# Longest document length (in NLTK word tokens); needed later to size the word embeddings
nltk.download('punkt')
token_counts = (len(nltk.word_tokenize(doc)) for doc in data.content)
# -1 is kept as the sentinel result for an empty corpus
maxlen = max(token_counts, default=-1)
print("The maximum number of words in any document is =", maxlen)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The maximum number of words in any document is = 1083


In [None]:
# Fit a Keras Tokenizer on the corpus, then encode every document as a sequence of word indices
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

corpus = data['content']

# NOTE(review): per the Keras docs only the most common num_words-1 words are
# kept, so with num_words=total_words the rarest word is dropped — confirm intended.
bagOfWords = Tokenizer(num_words=total_words)
bagOfWords.fit_on_texts(corpus)
bagOfWords_sequences = bagOfWords.texts_to_sequences(corpus)

In [None]:
# Pad / truncate every sequence to one fixed length so all samples share the same shape
# (shorter sequences are filled with 0s at the end). The length could be maxlen or a
# smaller number; 40 seems to work well based on results.
from tensorflow.keras.preprocessing.sequence import pad_sequences

bagOfWords_padded = pad_sequences(
    bagOfWords_sequences,
    maxlen=40,
    padding='post',
    truncating='post',
)

In [None]:
# Saving the fitted tokenizer.
# The context manager closes (and flushes) the file handle deterministically;
# the bare open(...) passed to pickle.dump was never closed.
import pickle
with open('/content/drive/MyDrive/Fake News Data/Models/BagOfWords.pkl', 'wb') as model_file:
    pickle.dump(bagOfWords, model_file)

# **Vectorization using Tfidf vectorizer**

In [None]:
# Build the TF-IDF corpus: letters only, lowercased, whitespace collapsed.
# Iterating over the column values directly (instead of label lookups via
# data['content'][i]) does not depend on the frame having a 0..n-1 index, so it
# keeps working after rows are filtered or the index is changed.
dataset = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', document).lower().split())
    for document in data['content']
]

In [None]:
# TF-IDF vectorizer over unigrams, bigrams and trigrams, capped at 5000 features
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_v = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
)

In [None]:
# Fit the vectorizer on the corpus and convert the result to a dense array
tfidf_matrix = tfidf_v.fit_transform(dataset)
X = tfidf_matrix.toarray()
# Target column
y = data['BinaryNumTarget']

In [None]:
# Saving the fitted TF-IDF vectorizer.
# The context manager closes (and flushes) the file handle deterministically;
# the bare open(...) passed to pickle.dump was never closed.
import pickle
with open('/content/drive/MyDrive/Fake News Data/Models/tfidf_v.pkl', 'wb') as model_file:
    pickle.dump(tfidf_v, model_file)

# **Vectorization using Word2Vec**

In [None]:
import gensim

In [None]:
y = data["BinaryNumTarget"].values

# Helpers for the conversion below
stop_words = set(nltk.corpus.stopwords.words("english"))
regexTokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Convert the corpus to the format gensim accepts: one flat list of tokens per document.
# Each document is split into sentences, lowercased and tokenized on word characters
# (which discards punctuation); one-character tokens are dropped.
# NOTE(review): stop_words is built here but not applied in this cell.
X = [
    [
        token.strip()
        for sentence in nltk.sent_tokenize(paragraph)
        for token in regexTokenizer.tokenize(sentence.lower())
        if len(token) > 1
    ]
    for paragraph in data["content"].values
]

In [None]:
# Dimension of the word vectors we are generating
EMBEDDING_DIM = 200

# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`; passing the old
# name there raises a TypeError. Pick the keyword that matches the installed
# version so the cell runs on both gensim 3.x and 4.x.
_gensim_major = int(gensim.__version__.split(".")[0])
_dim_kwarg = "vector_size" if _gensim_major >= 4 else "size"

# Creating word vectors with the Word2Vec method (takes time...)
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    window=10,
    min_count=1,
    **{_dim_kwarg: EMBEDDING_DIM},
)

In [None]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

# Tokenizing text -> representing each word by a number.
# The tokenizer is fitted on X, the per-document token lists built for Word2Vec.
w2v_tokenizer = Tokenizer()
w2v_tokenizer.fit_on_texts(X)

In [None]:
# Saving the Word2Vec model and its tokenizer.
# Context managers close (and flush) each file handle deterministically;
# the bare open(...) calls passed to pickle.dump were never closed.
import pickle
with open('/content/drive/MyDrive/Fake News Data/Models/w2v_model.pkl', 'wb') as model_file:
    pickle.dump(w2v_model, model_file)
with open('/content/drive/MyDrive/Fake News Data/Models/w2v_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(w2v_tokenizer, tokenizer_file)