In [1]:
import numpy as np
import pandas as pd

In [2]:
news = pd.read_csv('real_fake_news.csv')


In [3]:
news.head(44871)

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,fake
1,House Intelligence Committee Chairman Devin Nu...,fake
2,"On Friday, it was revealed that former Milwauk...",fake
3,"On Christmas day, Donald Trump announced that ...",fake
4,Pope Francis used his annual Christmas Day mes...,fake
...,...,...
44866,GENEVA (Reuters) - North Korea and the United ...,Real
44867,"SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",Real
44868,GENEVA (Reuters) - North Korea and the United ...,Real
44869,GENEVA (Reuters) - North Korea and the United ...,Real


In [4]:
news.shape

(44878, 2)

# Check the categories data on Label columns

In [5]:
news.label.value_counts()

fake     23459
Real     21417
label        1
Name: label, dtype: int64

# Data Cleaning: Handle NA values

In [6]:
news.isnull().sum()

text     0
label    1
dtype: int64

In [7]:
news.shape

(44878, 2)

In [8]:
news_df = news.dropna()
news_df.isnull().sum()

text     0
label    0
dtype: int64

In [9]:
news_df.shape

(44877, 2)

In [10]:
news_df['lable_num'] = news_df.label.map({
    "fake" : 0,
    "Real" : 1
    
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['lable_num'] = news_df.label.map({


In [11]:
news_df.head(44871)

Unnamed: 0,text,label,lable_num
0,Donald Trump just couldn t wish all Americans ...,fake,0.0
1,House Intelligence Committee Chairman Devin Nu...,fake,0.0
2,"On Friday, it was revealed that former Milwauk...",fake,0.0
3,"On Christmas day, Donald Trump announced that ...",fake,0.0
4,Pope Francis used his annual Christmas Day mes...,fake,0.0
...,...,...,...
44866,GENEVA (Reuters) - North Korea and the United ...,Real,1.0
44867,"SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",Real,1.0
44868,GENEVA (Reuters) - North Korea and the United ...,Real,1.0
44869,GENEVA (Reuters) - North Korea and the United ...,Real,1.0


In [12]:
news_df['lable_num'].round(2)

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
44872    1.0
44873    1.0
44874    1.0
44875    1.0
44876    1.0
Name: lable_num, Length: 44877, dtype: float64

In [13]:
news_df.dtypes['lable_num']

dtype('float64')

# Convert dataype of coulmn from Float into Int

In [14]:
news_df['lable_num'] = news_df['lable_num'].fillna(0).astype(int)
print(news_df)
print(news_df.dtypes)

                                                    text label  lable_num
0      Donald Trump just couldn t wish all Americans ...  fake          0
1      House Intelligence Committee Chairman Devin Nu...  fake          0
2      On Friday, it was revealed that former Milwauk...  fake          0
3      On Christmas day, Donald Trump announced that ...  fake          0
4      Pope Francis used his annual Christmas Day mes...  fake          0
...                                                  ...   ...        ...
44872  BRUSSELS (Reuters) - NATO allies on Tuesday we...  Real          1
44873  LONDON (Reuters) - LexisNexis, a provider of l...  Real          1
44874  MINSK (Reuters) - In the shadow of disused Sov...  Real          1
44875  MOSCOW (Reuters) - Vatican Secretary of State ...  Real          1
44876  JAKARTA (Reuters) - Indonesia will buy 11 Sukh...  Real          1

[44877 rows x 3 columns]
text         object
label        object
lable_num     int32
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['lable_num'] = news_df['lable_num'].fillna(0).astype(int)


# 1. PreProcessing 
#           - Remove stop Words
#          - lemmitization
# 2. vectorization

In [15]:
import spacy

In [16]:
# load spacy large model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
     -------------------------------------- 587.7/587.7 MB 1.0 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [17]:
import spacy 
nlp = spacy.load("en_core_web_lg")

In [18]:
def preprocess_and_vectorized(text):
    """create nlp object"""
    doc = nlp(text)
    
    filtered_tokens= []
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
            
    return filtered_tokens

In [19]:
preprocess_and_vectorized("hello i am very upset becuase i am going to far but happy that i got scholarshi")

['hello', 'upset', 'becuase', 'go', 'far', 'happy', 'get', 'scholarshi']

In [20]:
preprocess_and_vectorized("Don't worry if you don't understand")

['worry', 'understand']

In [21]:
import numpy as np

def preprocess_and_vectorize(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

In [22]:
import gensim

In [23]:
text1 = news_df.text.apply(gensim.utils.simple_preprocess)

In [24]:
text1

0        [donald, trump, just, couldn, wish, all, ameri...
1        [house, intelligence, committee, chairman, dev...
2        [on, friday, it, was, revealed, that, former, ...
3        [on, christmas, day, donald, trump, announced,...
4        [pope, francis, used, his, annual, christmas, ...
                               ...                        
44872    [brussels, reuters, nato, allies, on, tuesday,...
44873    [london, reuters, lexisnexis, provider, of, le...
44874    [minsk, reuters, in, the, shadow, of, disused,...
44875    [moscow, reuters, vatican, secretary, of, stat...
44876    [jakarta, reuters, indonesia, will, buy, sukho...
Name: text, Length: 44877, dtype: object

# Training the Word2Vec Model


Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

# Initialize the model

In [25]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

# Build Vocabulary

In [26]:
model.build_vocab(text1, progress_per=1000)

# Train the Word2Vec Model

In [27]:
model.train(text1, total_examples=model.corpus_count, epochs=model.epochs)

(69470001, 86881730)

# Finding Similar Words and Similarity between words

In [None]:
model.wv.most_similar("politics")

In [None]:
model.wv.similarity(w1="good", w2="great")

In [None]:
def preprocess_and_vectorized(text):
    """create nlp object"""
    doc = nlp(text)
    
    filtered_tokens= []
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
            
    return filtered_tokens

In [None]:
model.wv.get_mean_vector(["worry","understand"])