In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# Load the two BuzzFeed article sets from CSV: fabricated stories first,
# genuine stories second (paths are relative to the notebook's directory)
fake_df, real_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/BuzzFeed_fake_news_content.csv",
        "../data/BuzzFeed_real_news_content.csv",
    )
)

In [20]:
# Tag every row with its class in a new 'label' column: 1 = fake, 0 = real
for frame, class_id in ((fake_df, 1), (real_df, 0)):
    frame['label'] = class_id

In [22]:
# Stack the fake rows on top of the real rows and renumber the index 0..n-1
df = pd.concat((fake_df, real_df), axis=0, ignore_index=True)

In [23]:
# Display the current contents of df (pandas truncates long frames in the rendered output)
df

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,label
0,Fake_1-Webpage,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ...",1
1,Fake_10-Webpage,Charity: Clinton Foundation Distributed “Water...,Former President Bill Clinton and his Clinton ...,http://eaglerising.com/36899/charity-clinton-f...,http://eaglerising.com/wp-content/uploads/2016...,View All Posts,http://eaglerising.com,{'$date': 1474416521000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36899/charity-clinton-f...,"{""description"": ""The possibility that CHAI dis...",1
2,Fake_11-Webpage,A Hillary Clinton Administration May be Entire...,After collapsing just before trying to step in...,http://eaglerising.com/36880/a-hillary-clinton...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Tony Elliott",http://eaglerising.com,{'$date': 1474416638000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36880/a-hillary-clinton...,"{""description"": ""Hillary Clinton may be the fi...",1
3,Fake_12-Webpage,Trump’s Latest Campaign Promise May Be His Mos...,"Donald Trump is, well, deplorable. He’s sugges...",http://www.addictinginfo.org/2016/09/19/trumps...,http://addictinginfo.addictinginfoent.netdna-c...,John Prager,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://2.gravat...",http://addictinginfo.com/2016/09/19/trumps-lat...,"{""publisher"": ""Addicting Info | The Knowledge ...",1
4,Fake_13-Webpage,Website is Down For Maintenance,Website is Down For Maintenance,http://www.proudcons.com/clinton-foundation-ca...,,,http://www.proudcons.com,,,,,"{""og"": {""url"": ""http://www.proudcons.com"", ""ty...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,Real_88-Webpage,It’s “Trump is HITLER” Month at the Washington...,"Like much of the mainstream media, the Washing...",http://eaglerising.com/36955/its-trump-is-hitl...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Jeff Dunetz",http://eaglerising.com,{'$date': 1474526406000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36955/its-trump-is-hitl...,"{""description"": ""Now they are taking the lazy,...",0
178,Real_89-Webpage,Obama’s team isn’t laughing at Trump anymore,2016 Obama’s team isn’t laughing at Trump anym...,http://politi.co/2cW2vAD,http://v.politico.com/images/1155968404/201609...,"Edward-isaac Dovere,Eli Stokols,Politico Staff...",http://politi.co,{'$date': 1474866060000},,http://v.politico.com/images/1155968404/201609...,http://www.politico.com/story/2016/09/barack-o...,"{""description"": ""The president\u2019s aides wo...",0
179,Real_9-Webpage,"Georgia poll: Donald Trump, Hillary Clinton in...","Story highlights Trump has 45%, Clinton 42% an...",http://cnn.it/2cynaZx,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,Tal Kopan,http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/16102...,http://www.cnn.com/2016/09/19/politics/georgia...,"{""description"": ""Donald Trump is up 3 points o...",0
180,Real_90-Webpage,Chelsea Handler Gets The Last Word After RNC C...,There may be a few women out there who enjoy a...,http://www.addictinginfo.org/2016/09/19/chelse...,http://addictinginfo.addictinginfoent.netdna-c...,,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,https://d5nxst8...",http://addictinginfo.com/2016/09/19/chelsea-ha...,"{""publisher"": ""Addicting Info | The Knowledge ...",0


In [25]:
# Randomly permute all rows so fake and real articles are interleaved.
# frac=1 samples every row; random_state pins the permutation so re-runs match;
# the old index is dropped and replaced with a fresh 0..n-1 range.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [26]:
# Display the current contents of df (pandas truncates long frames in the rendered output)
df

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,label
0,Fake_27-Webpage,How Democrats are Going to Try to STEAL the El...,In every state there is a law governing the me...,http://eaglerising.com/37044/how-democrats-are...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Dale Summitt",http://eaglerising.com,{'$date': 1474874132000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/37044/how-democrats-are...,"{""description"": ""The Democratic Party not only...",1
1,Fake_48-Webpage,Tebow DISMANTLES national anthem protests in o...,20.1k SHARES Facebook Twitter\n\nWhat do Tim T...,http://freedomdaily.com/tebow-dismantles-natio...,http://freedomdaily.com/wp-content/uploads/201...,,http://freedomdaily.com,{'$date': 1474577949000},,http://1csabj4ddrd61fgqez2e4nss.wpengine.netdn...,http://freedomdaily.com/tebow-dismantles-natio...,"{""googlebot"": ""noimageindex"", ""generator"": ""Po...",1
2,Real_67-Webpage,Terrorist Attacks Will Likely Affect 2016 Pres...,The recent connected bombings in New York and ...,http://abcn.ws/2cDjJ92,http://a.abcnews.com/images/US/AP_Explosion3_m...,"More Meghan,Abc News",http://abcn.ws,,,http://a.abcnews.com/images/US/AP_Manhattan_Ex...,http://abcnews.go.com/Politics/terrorist-attac...,"{""fb_title"": ""Terrorist Attacks Likely to Affe...",0
3,Real_16-Webpage,Young Girl's Emotional Council Speech Laments ...,Peaceful protesters crowded Charlotte's first ...,http://abcn.ws/2cTj7ap,http://a.abcnews.com/images/US/AP_Charlotte_Po...,"More Michael,Abc News",http://abcn.ws,,,http://a.abcnews.com/images/US/AP_Charlotte_Po...,http://abcnews.go.com/US/young-girls-emotional...,"{""fb_title"": ""Young Girl's Emotional Speech La...",0
4,Real_6-Webpage,Obama weighs in on the debate,Obama weighs in on the debate\n\nPresident Bar...,http://politi.co/2dpdeXp,http://v.politico.com/images/1155968404/201609...,"Brianna Ehley,Jack Shafer",http://politi.co,{'$date': 1474984149000},,http://v.politico.com/images/1155968404/201609...,http://www.politico.com/story/2016/09/trump-de...,"{""description"": ""Obama touted Clinton\u2019s p...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,Real_23-Webpage,Dem congressman to IRS: Audit the Trump Founda...,"Story highlights Trump reportedly used $258,00...",http://cnn.it/2diatmB,http://i2.cdn.cnn.com/cnnnext/dam/assets/16092...,Tom Lobianco,http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/16092...,http://www.cnn.com/2016/09/21/politics/levin-i...,"{""description"": ""Michigan Democratic Rep. Sand...",0
178,Fake_22-Webpage,Hillary Denies She and Obama Founded ISIS…Then...,It was late one night in the White House when ...,,,,,,,,,{},1
179,Real_10-Webpage,"Donald Trump: Drugs a 'Very, Very Big Factor' ...",Less than a day after protests over the police...,http://abcn.ws/2d4lNn9,http://a.abcnews.com/images/Politics/AP_donald...,"More Candace,Adam Kelsey,Abc News,More Adam",http://abcn.ws,,,http://www.googleadservices.com/pagead/convers...,http://abcnews.go.com/Politics/donald-trump-dr...,"{""fb_title"": ""Trump: Drugs a 'Very, Very Big F...",0
180,Real_9-Webpage,"Georgia poll: Donald Trump, Hillary Clinton in...","Story highlights Trump has 45%, Clinton 42% an...",http://cnn.it/2cynaZx,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,Tal Kopan,http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/16102...,http://www.cnn.com/2016/09/19/politics/georgia...,"{""description"": ""Donald Trump is up 3 points o...",0


In [39]:
# Build one text field per article: "<title> <text>".
# Missing titles or bodies are treated as empty strings so the
# concatenation never produces NaN.
df['combined_text'] = (
    df['title'].fillna('')
    .add(' ')
    .add(df['text'].fillna(''))
)

In [40]:
# Features are the combined title + body text; the target is the fake/real label
X, y = df['combined_text'], df['label']

In [41]:
# Hold out 20% of the articles for testing. The split is stratified on y so
# both partitions keep the same fake/real ratio, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Turn the raw text into TF-IDF weighted term features.
# English stop words are removed and the vocabulary is capped at 500 terms.
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')

# Learn the vocabulary and IDF weights from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... then reuse that fitted vocabulary on the test split (no refitting)
X_test_tfidf = vectorizer.transform(X_test)

In [43]:
# Train a logistic regression classifier on the TF-IDF features.
# max_iter is an upper bound on solver iterations, not a fixed count.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_tfidf, y=y_train)

In [44]:
# Predict labels for the held-out articles
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 plus overall accuracy on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76        19
           1       0.74      0.78      0.76        18

    accuracy                           0.76        37
   macro avg       0.76      0.76      0.76        37
weighted avg       0.76      0.76      0.76        37

