In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
Fake_news = pd.read_csv("C:\\python\\news.csv")

In [3]:
dataset=Fake_news[['text','label']]

In [4]:
dataset['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [5]:
dataset.shape

(6335, 2)

In [6]:
dataset.head(20)

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1
5,"In these trying times, Jackie Mason is the Voi...",0
6,Ever wonder how Britain’s most iconic pop pian...,1
7,"PARIS — France chose an idealistic, traditi...",0
8,Donald J. Trump is scheduled to make a highly ...,0
9,A week before Michael T. Flynn resigned as nat...,0


In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [7]:
ps = WordNetLemmatizer()


In [8]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BISWA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BISWA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
stopwords = stopwords.words('english')

In [10]:
def cleaning_data(row):
    
    # convert text to into lower case
    row = row.lower() 
    
    # this line of code only take words from text and remove number and special character using RegX
    row = re.sub('[^a-zA-Z]' , ' ' , row)
    
    # split the data and make token.
    token = row.split() 
    
    # lemmatize the word and remove stop words like a, an , the , is ,are ...
    news = [ps.lemmatize(word) for word in token if not word in stopwords]  
    
    # finaly join all the token with space
    cleanned_news = ' '.join(news) 
    
    # return cleanned data
    return cleanned_news 

In [11]:
dataset['text'] = dataset['text'].apply(lambda x : cleaning_data(X))

NameError: name 'X' is not defined

In [12]:
dataset.isnull().sum()

text     0
label    0
dtype: int64

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer(max_features = 50000 , lowercase=False , ngram_range=(1,2))

In [15]:
dataset.shape

(6335, 2)

In [16]:
X = dataset.iloc[:35000,0]
y = dataset.iloc[:35000,1]

In [17]:
X.head()

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [18]:
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [19]:
from sklearn.model_selection import train_test_split
train_data , test_data , train_label , test_label = train_test_split(X , y , test_size = 0.2 ,random_state = 0)

In [20]:
dataset['text'] = dataset['text'].apply(lambda x : cleaning_data(X))

AttributeError: 'Series' object has no attribute 'lower'

In [21]:
vec_train_data = vectorizer.fit_transform(train_data.values.astype('U'))

In [22]:
vec_train_data = vec_train_data.toarray()

In [23]:
train_data.shape , test_data.shape

((5068,), (1267,))

In [24]:
vec_test_data = vectorizer.transform(test_data.values.astype('U')).toarray()

In [25]:
vec_train_data.shape , vec_test_data.shape

((5068, 50000), (1267, 50000))

In [26]:
train_label.value_counts() # balanced partition

FAKE    2549
REAL    2519
Name: label, dtype: int64

In [27]:
test_label.value_counts() # balanced partition

REAL    652
FAKE    615
Name: label, dtype: int64

In [28]:
training_data = pd.DataFrame(vec_train_data , columns=vectorizer.get_feature_names())
testing_data = pd.DataFrame(vec_test_data , columns= vectorizer.get_feature_names())



In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
from sklearn.metrics import accuracy_score,classification_report

In [31]:
clf = MultinomialNB()

In [32]:
clf.fit(training_data, train_label)
y_pred  = clf.predict(testing_data)

In [33]:
pd.Series(y_pred).value_counts()

REAL    727
FAKE    540
dtype: int64

In [34]:
test_label.value_counts()

REAL    652
FAKE    615
Name: label, dtype: int64

In [35]:
print(classification_report(test_label , y_pred))

              precision    recall  f1-score   support

        FAKE       0.94      0.82      0.88       615
        REAL       0.85      0.95      0.90       652

    accuracy                           0.89      1267
   macro avg       0.90      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267



In [36]:
y_pred_train = clf.predict(training_data)
print(classification_report(train_label , y_pred_train))

              precision    recall  f1-score   support

        FAKE       0.97      0.92      0.94      2549
        REAL       0.92      0.97      0.95      2519

    accuracy                           0.94      5068
   macro avg       0.95      0.95      0.94      5068
weighted avg       0.95      0.94      0.94      5068



In [37]:
accuracy_score(train_label , y_pred_train)

0.9449486977111287

In [38]:
accuracy_score(test_label , y_pred)

0.8887134964483031

In [2]:
from sklearn.metrics import accuracy_score, confusion_matrix
#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

NameError: name 'y_test' is not defined