In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk



In [3]:
# Load the news articles from news.csv (path is relative to the working directory).
dataset = pd.read_csv("news.csv")

In [4]:
# Count how many rows carry each value of the `label` column.
dataset.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [5]:
# Integer encoding of the string labels: FAKE -> 0, REAL -> 1.
label_map = {"FAKE": 0, "REAL": 1}

# Store the encoded target next to the original label column.
dataset["new_label"] = dataset["label"].map(label_map)

In [6]:
# Preview the first rows of the dataset.
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,new_label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1


In [7]:
# Select features and target by column name instead of position, so the
# result does not depend on column order or on `new_label` happening to be
# the last column of the frame.
# X is a 2-column object array: column 0 = title, column 1 = text.
X = dataset[["title", "text"]].values
# y is the integer-encoded label (0 = FAKE, 1 = REAL).
y = dataset["new_label"].values

In [8]:
# Render a per-column summary of the dataset with summarytools.
from summarytools import dfSummary
dfSummary(dataset)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Unnamed: 0 [int64],Mean (sd) : 5280.4 (3038.5) min < med < max: 2.0 < 5271.0 < 10557.0 IQR (CV) : 5226.5 (1.7),"6,335 distinct values",,0 (0.0%)
2,title [object],1. OnPolitics | 's politics blog 2. Michael Moore Owes Me $4.99 3. Hillary’s “Big Tent” is Obama’ 4. Get Ready For Civil Unrest: Su 5. The Dark Agenda Behind Globali 6. Donald Trump is blatantly raci 7. Guardian Front Page: “A 16-Yea 8. Schools All Over America Are C 9. Tony Blair suggests a second r 10. Saudi ambassador to the UAE: A 11. other,"5 (0.1%) 3 (0.0%) 3 (0.0%) 3 (0.0%) 2 (0.0%) 2 (0.0%) 2 (0.0%) 2 (0.0%) 2 (0.0%) 2 (0.0%) 6,309 (99.6%)",,0 (0.0%)
3,text [object],"1. Killing Obama administration r 2. 3. A verdict in 2017 could have s 4. The election in 232 photos, 43 5. On this day in 1973, J. Fred B 6. Top Dems want White House to c 7. Click Here To Learn More About 8. Notable names include Ray Wash 9. ""One should not insist on nail 10. Leave a Reply Click here to ge 11. other","58 (0.9%) 36 (0.6%) 23 (0.4%) 17 (0.3%) 12 (0.2%) 12 (0.2%) 10 (0.2%) 9 (0.1%) 8 (0.1%) 8 (0.1%) 6,142 (97.0%)",,0 (0.0%)
4,label [object],1. REAL 2. FAKE,"3,171 (50.1%) 3,164 (49.9%)",,0 (0.0%)
5,new_label [int64],Mean (sd) : 0.5 (0.5) min < med < max: 0.0 < 1.0 < 1.0 IQR (CV) : 1.0 (1.0),2 distinct values,,0 (0.0%)


In [9]:
# Column 0 of X holds the article titles; column 1 holds the article text.
X[:,0]
# X[0:3,1]

array(['You Can Smell Hillary’s Fear',
       'Watch The Exact Moment Paul Ryan Committed Political Suicide At A Trump Rally (VIDEO)',
       'Kerry to go to Paris in gesture of sympathy', ...,
       'Anti-Trump Protesters Are Tools of the Oligarchy     : Information',
       'In Ethiopia, Obama seeks progress on peace, security in East Africa',
       "Jeb Bush Is Suddenly Attacking Trump. Here's Why That Matters"],
      dtype=object)

# Word2Vec document embeddings

In [10]:
from gensim.models import Word2Vec
from gensim.utils import  simple_preprocess

# One token list per article, built from "<title> <text>".
# simple_preprocess lowercases the document and drops tokens that are too
# short or too long (signature: simple_preprocess(doc, deacc=False,
# min_len=2, max_len=15)).
tokenized_text = [simple_preprocess(text) for text in X[:,0]+' '+X[:,1]]

# min_count=1 keeps words that appear only once in the vocabulary.
word2vec_model = Word2Vec(sentences=tokenized_text,min_count=1)

# Represent each article by the mean of its word vectors.
X_word2vec = []
for tokens in tokenized_text:
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if vectors:
        X_word2vec.append(sum(vectors)/len(vectors))
    else:
        # Article with no usable tokens: fall back to a zero vector of the
        # model's dimensionality. The original referenced an undefined name
        # (`word2vec`) here and raised NameError on this branch.
        X_word2vec.append(np.zeros(word2vec_model.wv.vector_size, dtype=np.float32))
X_word2vec = np.array(X_word2vec)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the document vectors for evaluation; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters, trained on the averaged word vectors.
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

GradientBoostingClassifier()

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same scaling to both splits.
sc = MinMaxScaler().fit(X_train)
scaled_X_train = sc.transform(X_train)
scaled_X_test = sc.transform(X_test)

# Multinomial Naive Bayes trained on the rescaled features.
nlf = MultinomialNB()
nlf.fit(scaled_X_train, y_train)

MultinomialNB()

In [14]:
from sklearn.ensemble import RandomForestClassifier

# 50-tree random forest using the entropy split criterion; seeded for repeatability.
rf = RandomForestClassifier(
    n_estimators=50,
    criterion="entropy",
    random_state=0,
)
rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0)

In [15]:
y_pred_rf = rf.predict(X_test) # Random Forest predictions on the held-out split

In [16]:
y_pred = clf.predict(X_test) # Gradient Boosting predictions on the held-out split

In [17]:
y_pred_nv = nlf.predict(scaled_X_test) # Naive Bayes predictions (needs the MinMax-scaled features)

In [18]:
# Gradient Boosting predictions (0 = FAKE, 1 = REAL).
y_pred

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [19]:
# Naive Bayes predictions (0 = FAKE, 1 = REAL).
y_pred_nv

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [20]:
# Random Forest predictions (0 = FAKE, 1 = REAL).
y_pred_rf

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [21]:
# Ground-truth labels of the held-out split, for comparison with the predictions.
y_test

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [22]:
from sklearn.metrics import classification_report,confusion_matrix

# Gradient Boosting: per-class precision/recall/F1, then the confusion matrix.
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cr, cm, sep="\n")

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       615
           1       0.87      0.88      0.87       652

    accuracy                           0.87      1267
   macro avg       0.87      0.87      0.87      1267
weighted avg       0.87      0.87      0.87      1267

[[526  89]
 [ 80 572]]


In [23]:
from sklearn.metrics import classification_report,confusion_matrix

# Multinomial Naive Bayes: per-class precision/recall/F1, then the confusion matrix.
crnv = classification_report(y_test, y_pred_nv)
cmnv = confusion_matrix(y_test, y_pred_nv)
print(crnv, cmnv, sep="\n")

              precision    recall  f1-score   support

           0       0.72      0.83      0.77       615
           1       0.81      0.70      0.75       652

    accuracy                           0.76      1267
   macro avg       0.77      0.76      0.76      1267
weighted avg       0.77      0.76      0.76      1267

[[510 105]
 [197 455]]


In [24]:
from sklearn.metrics import classification_report,confusion_matrix

# Random Forest: per-class precision/recall/F1, then the confusion matrix.
crrf = classification_report(y_test, y_pred_rf)
cmrf = confusion_matrix(y_test, y_pred_rf)
print(crrf, cmrf, sep="\n")

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       615
           1       0.86      0.85      0.85       652

    accuracy                           0.85      1267
   macro avg       0.85      0.85      0.85      1267
weighted avg       0.85      0.85      0.85      1267

[[524  91]
 [101 551]]


In [25]:
# Translate the Gradient Boosting predictions back to label strings (0 -> "FAKE", otherwise "REAL").
predict_labels = np.where(y_pred == 0, "FAKE", "REAL").tolist()

In [26]:
def predict_label(title,text):
    """Classify a single news article as "FAKE" or "REAL".

    The article is embedded exactly as the training documents were: title
    and text are joined with a single space, tokenized with
    ``simple_preprocess``, and the Word2Vec vectors of all in-vocabulary
    tokens are averaged into one document vector. That vector is passed to
    the gradient boosting classifier ``clf``.

    The previous version averaged title and text separately, concatenated
    the two vectors and kept only the first 100 components, so the
    classifier received features that did not match what it was trained on.

    Parameters
    ----------
    title : str
        Headline of the article. ``None`` is treated as an empty string.
    text : str
        Body of the article. ``None`` is treated as an empty string.

    Returns
    -------
    str
        ``"FAKE"`` if the classifier predicts class 0, otherwise ``"REAL"``.
    """
    # Same "<title> <text>" layout used to build the training corpus.
    document = f"{title or ''} {text or ''}"
    tokens = simple_preprocess(document)

    wv = word2vec_model.wv
    vectors = [wv[word] for word in tokens if word in wv]

    if vectors:
        doc_vector = np.asarray(sum(vectors) / len(vectors))
    else:
        # No known tokens: zero vector of the model's own dimensionality,
        # not a hard-coded size.
        doc_vector = np.zeros(wv.vector_size, dtype=np.float32)

    # The classifier expects a 2-D array of shape (n_samples, n_features).
    prediction = clf.predict(doc_vector.reshape(1, -1))

    # predict returns a length-1 array; compare its single element.
    return "FAKE" if prediction[0] == 0 else "REAL"

In [27]:
title12 = "State Department will not release 22 'top secret' Clinton emails"
text12 = '''"(CNN) The State Department announced Friday that it will not release 22 emails from former Secretary of State Hillary Clinton because they contain ""top secret"" information, the highest level of government classification.

The decision, coming three days before the Iowa caucuses, could provide fodder for Clinton's political opponents, especially Republicans, who are likely to make note of the emails' ""top secret"" designation. Clinton's email use has haunted her on the campaign trail since it became public early last year that she maintained a private server while leading the State Department.

State Department spokesman John Kirby said the documents, totaling 37 pages, were not marked classified at the time they were sent, but are being upgraded at the request of the Intelligence Community because they contain sensitive information.

But, Kirby said, a separate review by the bureaus of Diplomatic Security and Intelligence and Research is being held into whether the information in the emails was classified at the time they were sent and received. He would not say when the review began or how long it would go, and acknowledged it's possible there could be classified emails that weren't marked as such.

""It's certainly possible that for any number of reasons, traffic can be sent that's not marked appropriately for its classification. That is certainly possible,"" Kirby said.

But he added that he wasn't going to make any judgments about this particular case.

""All I can tell you definitively is it wasn't marked classified at the time it was sent,"" Kirby said.

A senior State Department official said the review ""began very recently"" and was initiated by the State Department, but the official wouldn't say what prompted it.

A spokesperson for the Intelligence Community's inspector general declined to comment.

Kirby also said 18 emails, comprised of eight email chains between Clinton and President Barack Obama, are being ""withheld in full"" to ""protect the President's ability to receive unvarnished advice and counsel."" But, Kirby said, they ""have not been determined to be classified"" and said they will ""ultimately be released in accordance with the Presidential records act.""

""I'd love for people to see what I did and I hope that will happen,"" she said.

Brian Fallon, a spokesman for Clinton's campaign, said in a statement that Friday's announcement was a case of ""over-classification run amok"" and reiterated Clinton's position that the emails be made public.

But later Friday, Fallon declined to say whether Clinton would ask Obama to declassify the emails when pressed by CNN's Wolf Blitzer on ""The Situation Room.""

""The President easily could declassify all of these emails if she asked him and if he agreed, right?"" Blitzer asked.

""I'd really be surprised if this has risen to the President's level,"" Fallon replied. ""Because, again, this a mundane matter of fulfilling a FOIA request.""

Asked Friday if he had ""certainty and confidence"" that Clinton will not be indicted over the email controversy, White House Press Secretary Josh Earnest said any decision to prosecute Clinton would rest with the Justice Department.

""That is a decision to be made solely by independent prosecutors,"" Earnest said. ""But again, based on what we know from the Department of Justice, it does not seem to be headed in that direction.""

The State Department released more than 900 of Clinton's emails Friday -- 242 of which received classification upgrades: 11 to ""secret"" and 209 more to ""confidential,"" along with the 22 emails containing ""top secret"" information -- but the release fell well behind the judge-imposed timetable for producing all of her emails.

Among the most interesting correspondence:

This month's release was supposed to be the final one and include just over 9,000 pages of documents -- the largest number to date.

That delay was then compounded by a huge snowstorm that shut down the federal government for several days, according to the State Department's motion.

Several prominent Republicans, including presidential hopefuls, quickly condemned Clinton, the Democratic 2016 front-runner, over Friday's developments.

""The new e-mail release is a disaster for Hillary Clinton. At a minimum, how can someone with such bad (judgment) be our next president?"" GOP front-runner Donald Trump tweeted.

Florida Sen. Marco Rubio said Clinton's email use was a ""disqualifier"" for the White House.

""Hillary Clinton put some of the highest, most sensitive intelligence information on her private server because maybe she thinks she's above the law,"" Rubio said at a town hall event in Clinton, Iowa.

Texas Sen. Ted Cruz told conservative radio host Hugh Hewitt that Clinton's email controversy would seriously imperil her presidential aspirations.

""We are talking about serious offenses for which the Obama Justice Department threw the book at General (David) Petraeus,"" Cruz said. ""And justice needs to be enforced fairly and impartially.""

And Republican National Committee Chairman Reince Priebus tweeted that Clinton and the Obama administration have ""obfuscated and misled at every available opportunity,"" adding that she has ""removed all doubt that she cannot be trusted with the presidency.""

But Rep. Adam Schiff, D-California, the ranking Democrat on the House Intelligence Committee, said classification determinations ""are often very complex.""

""It's important to remember that none of these emails had any classification markings at the time they were sent, and Secretary Clinton and her staff were responding to world events in real time without the benefit of months of analysis after the fact,"" Schiff said.

Meanwhile, Clinton's top Democratic 2016 rival, Vermont Sen. Bernie Sanders, said in a statement that ""there is a legal process in place which should proceed and not be politicized.""

""The American people are sick and tired of hearing about your damn emails,"" he said then to applause."

'''
# Classify the sample article (title12 / text12) and print the resulting label.
predicted_label = predict_label(title12, text12)
print("Predicted Label:", predicted_label)

Predicted Label: REAL


In [28]:
import gradio as gr

In [29]:
# Serve predict_label through a Gradio form: two text inputs (title, text)
# and one text output carrying the predicted label.
interface = gr.Interface(
    fn=predict_label,
    inputs=["text", "text"],
    outputs="text",
    title="FAKE NEWS DETECTION",
    description="Enter the title and text of the news to predict whether it is fake or real.",
)
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


