In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import nltk
import re
import gensim
from tqdm import tqdm
from nltk import word_tokenize , sent_tokenize
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

<b>Dataset

In [3]:
# Read the Data
fakeDataset = pd.read_csv('fake.csv')
trueDataset = pd.read_csv('true.csv')

# Add the Column in Each Dataset (is_fake)
fakeDataset['isFake'] = 1
trueDataset['isFake'] = 0

# Display the Shape of the Fake & True Dataset
print('The Shape of the Fake News is : {}'.format(fakeDataset.shape))
print('The Shape of the True News is : {}'.format(trueDataset.shape))

The Shape of the Fake News is : (23481, 5)
The Shape of the True News is : (21417, 5)


In [4]:
# Concatenation the Dataset
dataset = pd.concat([fakeDataset,trueDataset])

# Display the Shape of the Dataset
print('The Shape of the Dataset is : {}'.format(dataset.shape))

The Shape of the Dataset is : (44898, 5)


In [5]:
# Now Get the usefull Columns and create the new Dataset
dataset = dataset[['text' , 'isFake']]

# Now Shuffle the Dataset
newDataset = shuffle(dataset)

# Display the Data from newDataset
newDataset.head()

Unnamed: 0,text,isFake
18234,Here s a question for the anti-Trump media: If...,1
533,WASHINGTON (Reuters) - A senior U.S. Consumer ...,0
16473,BRASILIA (Reuters) - Brazilian lawmakers rejec...,0
9765,WASHINGTON/NEW YORK (Reuters) - Activist inves...,0
15527,It s too early to endorse Hillary? What s Al G...,1


In [6]:
# Display the Shape of the Actuall Dataset
print('The Shape of the Actual Dataset is : {}'.format(newDataset.shape))

The Shape of the Actual Dataset is : (44898, 2)


In [7]:
# Check the missing values in the Dataste
nan_values = newDataset.isnull().sum()
print('The total Missing Values in the Dataset is : \n{}'.format(nan_values))

The total Missing Values in the Dataset is : 
text      0
isFake    0
dtype: int64


In [8]:
# Check the Duplicated Row in the Dataset
dup_row = newDataset.duplicated().sum()
print('The Total Duplicated Row in the Dataset is : {}'.format(dup_row))

The Total Duplicated Row in the Dataset is : 6251


In [9]:
# Now Drop the Duplicated Row from the Dataset
newDataset.drop_duplicates(inplace = True)

In [10]:
# Check the Balanced/Imbalaned Dataset
countValue = newDataset['isFake'].value_counts()
print(countValue)

0    21192
1    17455
Name: isFake, dtype: int64


<b>Text Preprocessing

In [11]:
# Convert the Dataset into Independent & Dependent Matrix 
X = newDataset.iloc[:,0].values
Y = newDataset.iloc[:,-1].values

# Display the Shape of the Matrixs
print('The Shape of the X Matrix is : {}'.format(X.shape))
print('The Shape of the Y Matrix is : {}'.format(Y.shape))

The Shape of the X Matrix is : (38647,)
The Shape of the Y Matrix is : (38647,)


In [12]:
# Create the Object of Porter Stemmer
stemmer = PorterStemmer()
Corpus = []

for index in range(0,len(X)):
    text = re.sub("[^a-zA-Z]" , ' ' , X[index])
    text = text.lower()
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text if word not in set(stopwords.words('english'))]
    Corpus.append(text)

<b>Vectorizing the text Data (Word2Vec Scratch)

In [14]:
# Create the model of Word2Vec 
model = gensim.models.Word2Vec(
    window = 50,
    min_count = 3
)

In [15]:
# Build the Vocabulary 
model.build_vocab(Corpus)

In [16]:
# Train the model (Word2Vec)
model.train(Corpus , total_examples = model.corpus_count , epochs = model.epochs)

(43148314, 45036170)

<b>Word2Vec(Important Functions)

In [17]:
# Display the unique words
unqWord = model.wv.index_to_key
print(unqWord)

['trump', 'said', 'state', 'presid', 'u', 'would', 'republican', 'peopl', 'year', 'one', 'say', 'reuter', 'elect', 'govern', 'also', 'new', 'hous', 'donald', 'like', 'democrat', 'time', 'report', 'nation', 'clinton', 'call', 'obama', 'unit', 'countri', 'support', 'parti', 'campaign', 'senat', 'could', 'right', 'american', 'go', 'told', 'make', 'vote', 'white', 'offici', 'two', 'last', 'get', 'includ', 'use', 'want', 'polit', 'work', 'washington', 'law', 'offic', 'group', 'take', 'first', 'back', 'even', 'news', 'secur', 'former', 'day', 'week', 'court', 'imag', 'mani', 'plan', 'show', 'leader', 'attack', 'may', 'need', 'percent', 'think', 'administr', 'come', 'made', 'sinc', 'million', 'accord', 'month', 'know', 'hillari', 'tax', 'bill', 'ask', 'presidenti', 'issu', 'polic', 'us', 'russia', 'way', 'media', 'statement', 'investig', 'forc', 'twitter', 'rule', 'north', 'member', 'via', 'help', 'polici', 'meet', 'tri', 'talk', 'foreign', 'well', 'gener', 'public', 'america', 'feder', 'mili

In [18]:
# Display the Total Unique Words
print('Total Uniqne Words : {}'.format(len(unqWord)))

Total Uniqne Words : 36775


In [19]:
# Display the All Vectors in 2-D Numpy Array
model.wv.get_normed_vectors()

array([[-0.02248277, -0.20136856,  0.11395926, ..., -0.16163585,
         0.15804642,  0.02860002],
       [ 0.02386413, -0.0329942 ,  0.0983443 , ..., -0.00254588,
         0.03306786,  0.07372367],
       [ 0.16879506,  0.05249024, -0.2734328 , ..., -0.07736836,
         0.05807134, -0.08408707],
       ...,
       [ 0.07567289,  0.22353612, -0.03373948, ...,  0.08794161,
        -0.08898716, -0.07205143],
       [ 0.02879438,  0.17653883,  0.07005294, ..., -0.24067429,
        -0.01774124, -0.05123293],
       [-0.08412647,  0.13693875,  0.11989737, ..., -0.09382868,
         0.06003834, -0.13452382]], dtype=float32)

In [20]:
# Find out the most similar word of this word (state)
model.wv.similar_by_word('state')

[('nation', 0.5293363928794861),
 ('steelwork', 0.4744691252708435),
 ('thereto', 0.46948567032814026),
 ('kingdom', 0.468544065952301),
 ('emir', 0.4563678205013275),
 ('michaelfolk', 0.4296437203884125),
 ('ntpc', 0.41802430152893066),
 ('continent', 0.4147549569606781),
 ('vda', 0.4108944237232208),
 ('altgov', 0.4100116193294525)]

In [21]:
# find out the Similarity Between 2 Words
model.wv.similarity(w1 = 'state' , w2 = 'american')

0.08957322

In [22]:
# find out the Similarity Between 2 Words
model.wv.similarity(w1 = 'state' , w2 = 'nation')

0.52933633

<b>AvgWord2Vec (Convert Each Row Text to Vectors)

In [23]:
def avgWord2Vec(text):
    # AvgWord2Vec
    return np.mean([model.wv.get_vector(word) for word in text if word in model.wv.index_to_key] , axis = 0)

In [25]:
# Convert the Vectors of each row text
getVectors = []
for index in tqdm(range(0,len(Corpus))):
    getVectors.append(avgWord2Vec(Corpus[index]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 38647/38647 [15:31<00:00, 41.49it/s]


In [49]:
for index in range(0,len(Corpus)):
    if (str(getVectors[index].shape) != '(100,)'):
        getVectors[index] = getVectors[index+1]

In [50]:
# Convert the List into Numpy Array
vectorArray = np.array(getVectors)

# Display the Shape of the Array
print('The Shape of the Array is : {}'.format(vectorArray.shape))

The Shape of the Array is : (38647, 100)


In [51]:
vectorArray.shape

(38647, 100)

In [52]:
# Display the Data of VectorArray
vectorArray.ndim

2

In [53]:
# Now Stack the Numpy Array into 2-D Numpy Array
vectorArray2D = np.stack(vectorArray)

# Now Display the Shape of the Numpy Array
print('The Shape of the Numpy Array : {}'.format(vectorArray2D.shape))

The Shape of the Numpy Array : (38647, 100)


<b>Split the Data for (Training & Testing)

In [54]:
# Convert Data into Training & Testing
x_train , x_test , y_train , y_test = train_test_split(vectorArray2D , Y , test_size = 0.25 , random_state = 42)

# Display the Shape of the Train & Test


<b>Machine Learning Model

In [55]:
# Create the Object of Logistic Regression
model = LogisticRegression()
model.fit(x_train , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [57]:
# Predict the Result
prediction = model.predict(x_test)

# Find out the Accuracy of the Model
accModel = accuracy_score(y_test , prediction)

# Display the Accuracy of the Model
print('The Accuracy of the Model is : {}'.format(accModel))

The Accuracy of the Model is : 0.982612295590975
