# Program to identify when an article might be fake news. 

### The dataset was taken from a Kaggle competition

In [12]:
import pandas as pd

In [37]:
from sklearn.model_selection import train_test_split

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
from sklearn.tree import DecisionTreeClassifier

##### Import the list of stopwords (for example: and, the, is) from the nltk.corpus:

In [103]:
from nltk.corpus import stopwords

In [104]:
import re

In [105]:
import pickle

##### Import the PorterStemmer, a tool that reduces words to their root form (e.g., "running" → "run"):

In [23]:
from nltk.stem.porter import PorterStemmer

In [13]:
df=pd.read_csv("train.csv")

In [14]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [15]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [17]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

##### Handling missing values:

In [106]:
df=df.fillna('')

In [19]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [20]:
# Drop the id, title and author columns; only the remaining text and label
# columns are used as model input and target below.
df=df.drop(['id', 'title', 'author'], axis=1)

In [21]:
df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [87]:
port_stem=PorterStemmer()

In [91]:
port_stem.stem("Hello, my name is Anna * %@@@ %")

'hello, my name is anna * %@@@ %'

In [92]:
# Cache for stopword sets, keyed by language, so the corpus is read only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        # stopwords.words() returns a fresh list on every call; calling it per
        # word made the original loop re-read the corpus and do a linear scan
        # for each token. A frozenset gives constant-time membership tests.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content):
    """Clean a text and reduce its words to their stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, English stopwords are removed,
    and each remaining word is stemmed with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to normalize.

    Returns
    -------
    str
        The stemmed, stopword-free words joined by single spaces
        (an empty string when nothing survives the filtering).
    """
    stop_words = _english_stopwords()
    # Strip non-letters first so punctuation and digits never reach the stemmer.
    words = re.sub(r'[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in stop_words
    )

In [93]:
stemming("Hello, my name is Anna")

'hello name anna'

In [29]:
# Replace each article body with its cleaned form produced by stemming()
# (non-letters removed, lowercased, stopwords dropped, words stemmed).
df['text']=df['text'].apply(stemming)

In [34]:
x=df['text']

In [35]:
y=df['label']

In [36]:
y.shape

(20800,)

In [38]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [40]:
vect = TfidfVectorizer()

In [41]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to encode the test split, so the test rows do not influence
# the learned features.
x_train = vect.fit_transform(x_train)
x_test = vect.transform(x_test)

In [42]:
x_test.shape

(5200, 94026)

##### This is a classification task, so we can use a decision tree:

In [44]:
# Seed the estimator so that repeated runs build the same tree.
model=DecisionTreeClassifier(random_state=42)

In [45]:
model.fit(x_train, y_train)

In [46]:
prediction=model.predict(x_test)

In [60]:
prediction

array([1, 0, 1, ..., 0, 0, 1])

##### Model accuracy:

In [61]:
model.score(x_test, y_test)

0.8738461538461538

##### Saves the trained TfidfVectorizer object into a file:

In [63]:
# Use a context manager so the file is flushed and closed as soon as the
# dump finishes instead of leaving the handle open.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vect, vector_file)

##### Saves the trained decision tree model:

In [64]:
# Use a context manager so the file is flushed and closed as soon as the
# dump finishes instead of leaving the handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [65]:
# Read the saved vectorizer back; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('vector.pkl', 'rb') as vector_file:
    vector_form = pickle.load(vector_file)

In [94]:
# Read the saved model back; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [109]:
def faking_news(news):
    """Predict the label of a single news text.

    The text is cleaned with ``stemming``, encoded as a one-row input with
    the loaded vectorizer ``vector_form`` and classified by ``load_model``.

    Parameters
    ----------
    news : str
        Raw article text.

    Returns
    -------
    The result of ``load_model.predict`` for that one-row input.
    """
    cleaned = stemming(news)
    features = vector_form.transform([cleaned])
    return load_model.predict(features)

In [110]:
faking_news("""Wars are endangering the United States and other participating nations rather than protecting them. Nonviolent tools of law, diplomacy, aid, crisis prevention, and verifiable nuclear disarmament should be substituted for continuing counterproductive wars. Therefore, we, the undersigned, call for the immediate cancellation of the F-35 program as a whole, and the immediate cancellation of plans to base any such dangerous and noisy jets near populated areas. We oppose replacing the F-35 with any other weapon or basing the F-35 in any other locations. We further demand redirection of the money for the F-35 back into taxpayersвЂ™ pockets, and into environmental and human needs in the U.S., other F-35 customer nations, and around the world, including to fight climate change, pay off student debt, rebuild crumbling infrastructure, and improve education, healthcare, and housing.  """)

array([1])

In [111]:
output=faking_news("""In these trying times, Jackie Mason is the Voice of Reason. [In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, and explains how President Donald Trump could win the support of the Hollywood left if the U. S. needs to strike first.  “If he decides to bomb them, the whole country will be behind him, because everybody will realize he had no choice and that was the only thing to do,” Jackie says. “Except the Hollywood left. They’ll get nauseous. ” “[Trump] could win the left over, they’ll fall in love with him in a minute. If he bombed them for a better reason,” Jackie explains. “Like if they have no transgender toilets. ” Jackie also says it’s no surprise that Hollywood celebrities didn’t support Trump’s strike on a Syrian airfield this month. “They were infuriated,” he says. “Because it might only save lives. That doesn’t mean anything to them. If it only saved the environment, or climate change! They’d be the happiest people in the world. ” Still, Jackie says he’s got nothing against Hollywood celebs. They’ve got a tough life in this country. Watch Jackie’s latest clip above.   Follow Daniel Nussbaum on Twitter: @dznussbaum """)

In [113]:
# output holds the prediction for a single article; compare that one value
# to 0 explicitly rather than branching on the truthiness of an elementwise
# array-vs-list comparison.
if output[0] == 0:
    print("Reliable")
else:
    print("Unreliable")

Reliable
