In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#mount the drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#naviagte to directory where clean csv is
%cd /content/drive/My Drive/Group8-Project4/Resources/

/content/drive/My Drive/Group8-Project4/Resources


In [4]:
%ls

fake-news-data.zip  FakeNews_Processed_Data.csv  my_data.h5  WELFake_Dataset.csv


In [5]:
# defined path to csv
df = pd.read_csv('FakeNews_Processed_Data.csv')

# display the first few rows of the DataFrame
df.head(10)

Unnamed: 0,title_text,label
0,law enforcement high alert following threats c...,1
1,post votes hillary already,1
2,unbelievable obamas attorney general says char...,1
3,bobby jindal raised hindu uses story christian...,0
4,satan russia unvelis image terrifying new supe...,1
5,time christian group sues amazon splc designat...,1
6,dr ben carson targeted irs never audit spoke n...,1
7,house intel chair trumprussia fake story evide...,1
8,sports bar owner bans nfl gameswill show true ...,1
9,latest pipeline leak underscores dangers dakot...,1


In [6]:
# Print the data type of each entry in 'title_text'
print(df['title_text'].apply(type).value_counts())


<class 'str'>      72133
<class 'float'>        1
Name: title_text, dtype: int64


In [8]:
# Convert entire 'title_text' column to strings
df['title_text'] = df['title_text'].astype(str)



## **Word Embedding**
We will use Word2Vec for word embedding. We will use a pre-trained model to vectorise our data to start with because these models are trained on a massive corpus and can capture a wide range of semantic relationships. We will access the Google Word2Vec model.

In [9]:
!pip install gensim




In [10]:
# Word2Vec expects a list of tokenized sentences, where each sentence is a list of words
#this code is tokenizing the text

from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Download the tokenizer model

tokenized_data = [word_tokenize(sentence) for sentence in df['title_text']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
%cd /content/drive/My Drive/Group8-Project4


/content/drive/My Drive/Group8-Project4


In [None]:
%ls

01_Data_Processing_Team8_Prj4.ipynb             document_vectors.h5
02_fake_news_natural_language_processing.ipynb  [0m[01;34mResources[0m/


In [None]:
# Adjust the path according to your Google Drive structure
tokenized_data_path = '/content/drive/My Drive/Group8-Project4/tokenized_data.pkl'


In [None]:
import pickle

# Saving the tokenized_data
with open(tokenized_data_path, 'wb') as file:
    pickle.dump(tokenized_data, file)


In [11]:
#accessing the saved tokenized_data
tokenized_data_path = '/content/drive/My Drive/Group8-Project4/tokenized_data.pkl'


In [12]:
import pickle

# Load the tokenized data
with open(tokenized_data_path, 'rb') as file:
    loaded_tokenized_data = pickle.load(file)


In [13]:
#check that the loading was succesfull

print(loaded_tokenized_data[:2])  # print first two tokenized documents
print(len(loaded_tokenized_data)) # print the number of documents


[['law', 'enforcement', 'high', 'alert', 'following', 'threats', 'cops', 'whites', 'blacklivesmatter', 'fyf', 'terrorists', 'videono', 'comment', 'expected', 'barack', 'obama', 'members', 'fyf', 'fukyoflag', 'blacklivesmatter', 'movements', 'called', 'lynching', 'hanging', 'white', 'people', 'cops', 'encouraged', 'others', 'radio', 'show', 'tuesday', 'night', 'turn', 'tide', 'kill', 'white', 'people', 'cops', 'send', 'message', 'killing', 'black', 'people', 'americaone', 'fyoflag', 'organizers', 'called', 'sunshine', 'radio', 'blog', 'show', 'hosted', 'texas', 'called', 'sunshine', 'fing', 'opinion', 'radio', 'show', 'snapshot', 'fyf', 'lolatwhitefear', 'twitter', 'page', 'pm', 'shows', 'urging', 'supporters', 'call', 'fyf', 'tonight', 'continue', 'dismantle', 'illusion', 'white', 'snapshot', 'twitter', 'radio', 'call', 'invite', 'fyfthe', 'radio', 'show', 'aired', 'pm', 'eastern', 'standard', 'timeduring', 'show', 'callers', 'clearly', 'call', 'lynching', 'killing', 'white', 'peoplea'

In [14]:
#using the google pre-trained word2Vec model
import gensim.downloader as api

# Load Google's pre-trained Word2Vec model.
w2v_model = api.load('word2vec-google-news-300')




In [15]:
import numpy as np

def document_vector(word_list, model):
    # remove out-of-vocabulary words and get their vectors
    word_vectors = [model[word] for word in word_list if word in model.key_to_index]

    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # return zero vector if no words in the model
    else:
        # calculate the mean of word vectors to represent the document vector
        return np.mean(word_vectors, axis=0)


In [16]:
# Vectorize each document in the tokenized data
X = np.array([document_vector(doc, w2v_model) for doc in tokenized_data])


In [17]:
print(X.shape)

(72134, 300)


In [18]:
# Print the first vector to inspect
print(X[0])


[ 2.81908438e-02  2.75723021e-02  3.58001590e-02  6.53967410e-02
 -5.46097308e-02 -2.37619095e-02 -4.09275247e-03 -8.24435055e-02
  6.89467937e-02  8.02911073e-02 -2.15530396e-02 -1.22802809e-01
 -3.96617614e-02  5.43146245e-02 -9.78071019e-02  7.72967935e-02
 -1.07843494e-02  8.95663351e-02 -3.19761001e-02 -7.24404454e-02
  1.83907095e-02  3.65553647e-02  3.63810174e-02 -3.08289602e-02
  5.26063889e-03 -1.12113245e-02 -8.50686207e-02  4.42833193e-02
  4.32750955e-02 -6.67038513e-03  8.79115239e-03  1.32957762e-02
 -3.93132754e-02 -2.35816129e-02 -1.39553228e-03  1.34147163e-02
  5.09627163e-02  1.21055013e-02  2.41050366e-02  6.83943406e-02
  7.42554888e-02 -6.91036358e-02  1.33677557e-01 -1.94583535e-02
 -2.85491417e-03 -5.00720553e-02 -6.66472167e-02 -4.27507348e-02
 -2.42126845e-02  4.17479239e-02 -2.60889009e-02  3.14749368e-02
  2.81617939e-02 -8.21636000e-04  1.62940919e-02 -7.99699279e-04
 -6.70565292e-02 -5.35239205e-02  9.20308195e-03 -7.64516741e-02
 -1.03444858e-02  7.03349

In [19]:
%ls

fake-news-data.zip  FakeNews_Processed_Data.csv  my_data.h5  WELFake_Dataset.csv


In [20]:
%cd  /content/drive/My Drive/Group8-Project4

/content/drive/My Drive/Group8-Project4


In [21]:
%ls

01_Data_Processing_Team8_Prj4.ipynb             [0m[01;34mResources[0m/
02_fake_news_natural_language_processing.ipynb  tokenized_data.pkl


In [22]:
# Define the file path for the numpy file
npy_path = '/content/drive/My Drive/Group8-Project4/vectorized_data.npy'


In [23]:
# Save the array to a .npy file
np.save(npy_path, X)
