# Merge data
* Data format is following Hyesoo's format
* All data is exported to a csv file

In [1]:
import pandas as pd
import os

## Import Additional(Renata) data

In [2]:
path = os.path.join('data', 'Additional_fake_df.csv')
df_add_fakenews = pd.read_csv(path,usecols=[1,2,3,4,5,6])

## Import Jinmei's data

### import fake news

In [3]:
path = os.path.join('data', 'fakenews_jz.csv')
df_fakenews = pd.read_csv(path,usecols=[1,2,3,4,5])
df_fakenews['authenticity'] = 1

### import real news

In [4]:
path = os.path.join('data', 'realnews_jz.csv')
df_realnews = pd.read_csv(path,usecols=[0,1,2,3,4])
df_realnews['authenticity'] = 0

### merge fake and real news

In [5]:
df_jinmei = pd.concat([df_fakenews, df_realnews], ignore_index=True)
df_jinmei.tail()

Unnamed: 0,url,source,title,author,text,authenticity
1271,http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",[],,0
1272,http://www.newyorker.com/cartoon/dernavich-201...,newyorker,A Cartoon from The New Yorker,[],,0
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,0
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",0
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,0


In [6]:
df_jinmei.shape

(1276, 6)

### clean data

In [7]:
# drop rows where text contains NaN
df_jinmei = df_jinmei.dropna(axis=0,subset=['text'], how='any') 
df_jinmei.tail()

Unnamed: 0,url,source,title,author,text,authenticity
1268,http://www.newyorker.com/humor/borowitz-report...,newyorker,Stephen Hawking Angers Trump Supporters with B...,[],LONDON ( The Borowitz Report )—The theoretical...,0
1269,http://tunein.com/radio/New-Yorker-Radio-Hour-...,newyorker,The New Yorker Radio Hour,[],Description:\n\nDavid Remnick is joined by The...,0
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,0
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",0
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,0


In [8]:
df_jinmei.shape

(1213, 6)

In [9]:
# drop rows where the length of characters in text is less than 500
df_jinmei = df_jinmei[df_jinmei.text.map(len) > 500]
df_jinmei.shape

(822, 6)

## Import Hyesoo's data

In [10]:
path = os.path.join('data', 'hyesoo_df.csv')
df_hyesoo = pd.read_csv(path,usecols=[1,2,3,4,5,6]) 
df_hyesoo.tail()

Unnamed: 0,url,source,title,author,text,authenticity
1736,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,1
1737,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,1
1738,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,1
1739,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,1
1740,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,1


In [11]:
path = os.path.join('data', 'hyesoo_df.csv')
df_hyesoo = pd.read_csv(path,usecols=[1,2,3,4,5,6]) 
swap_map = {1:0, 0:1}
df_hyesoo.authenticity = df_hyesoo.authenticity.map(swap_map)
df_hyesoo.tail()

Unnamed: 0,url,source,title,author,text,authenticity
1736,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,0
1737,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,0
1738,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,0
1739,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,0
1740,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,0


## Merge all data

In [12]:
df_merge = pd.concat([df_jinmei, df_hyesoo, df_add_fakenews], ignore_index=True)
df_merge.tail()

Unnamed: 0,url,source,title,author,text,authenticity
3035,https://sputniknews.com/columnists/20170713105...,sputniknews,Russia 'Collusion' Smoking Gun?,[],"This is the proof, we are told, that the Trump...",1
3036,http://pamelageller.com/2017/07/state-departme...,pamelageller,State Department Removes Word ‘Genocide’ From ...,"['Geller Report Staff', 'Mark Steiner', 'David...",The U.S. State Department under President Dona...,1
3037,http://www.linkiesta.it/it/article/2017/04/27/...,pamelageller,"Luca Ricolfi: ""La vecchia sinistra è rimasta s...","['Alessandro Franzi', 'Di Euvisions', 'A Cura ...","La sinistra non sa più dare protezione. ""Ed è ...",1
3038,https://sputniknews.com/science/20170801105605...,sputniknews,To the Stars We Return: Start-Ups Offering Spa...,[],A Houston-based startup will send your cremate...,1
3039,https://sputniknews.com/radio_trendstorm/20170...,sputniknews,China & India: Walking the Himalayan Tightrope,[],China & India: Walking the Himalayan Tightrope...,1


In [13]:
df_merge.shape

(3040, 6)

In [14]:
df_merge = df_merge.sample(frac=1).reset_index(drop=True)

In [15]:
f = df_merge[df_merge.authenticity == 1].count()
t = df_merge[df_merge.authenticity == 0].count()
print("total number of fake news is {} and that of true news is {}".format(f,t))

total number of fake news is url             1474
source          1474
title           1474
author          1474
text            1474
authenticity    1474
dtype: int64 and that of true news is url             1566
source          1566
title           1566
author          1566
text            1566
authenticity    1566
dtype: int64


## Export data to csv file

In [16]:
path = os.path.join('data','merged_data.csv')
df_merge.to_csv(path)  

In [17]:
# test
df_test = pd.read_csv(path) 
df_test.tail(n=10)

Unnamed: 0.1,Unnamed: 0,url,source,title,author,text,authenticity
3030,3030,http://beforeitsnews.com/economy/2017/07/the-f...,beforeitsnews,The Moment of Truth is Upon Us! The Fed Delays...,['Due Diligence'],The Moment of Truth is Upon Us! The Fed Delays...,1
3031,3031,http://www.cnn.com/travel/article/catskills-ne...,cnn,Catskills see big revival as New York vacation...,[],"Alexandra Marvar, CNN • Updated 27th July 2017...",0
3032,3032,http://www.cbsnews.com/news/andy-cohen-on-meet...,cbsnews,"Andy Cohen on meeting President Trump, getting...",['Andrea Park'],TV personality Andy Cohen is known for his rea...,0
3033,3033,http://www.politico.eu/article/populist-italia...,pamelageller,Populist Italian marriage to give Brussels hea...,"['Giada Zampano', 'Jacopo Barigazzi', 'Helen C...",ROME — Italy’s next government could see a pop...,1
3034,3034,http://www.dailymail.co.uk/news/article-472061...,wordpress,Top general blames New York Times for ISIS lea...,['Keith Griffith For Dailymail.Com'],A top US general has blamed leaks to the New Y...,1
3035,3035,https://www.nytimes.com/2017/07/27/us/politics...,nytimes,"With New Sanctions, Senate Forces Trump’s Hand...",['Matt Flegenheimer'],“The administration supports sanctions against...,0
3036,3036,http://www.cbsnews.com/news/raw-chicken-should...,cbsnews,Does washing raw chicken make it safer to eat?,['Bianca Seidman'],Though it may seem like good hygiene to wash c...,0
3037,3037,http://www.dailywire.com/news/18742/fake-news-...,wordpress,Fake News Freefall: CNN Slips Waaaaaay Behind ...,[],"Another mammoth news week, another ratings cat...",1
3038,3038,http://www.reuters.com/article/us-usa-trump-pr...,reuters,Top Trump lieutenant Scaramucci lashes colleag...,['Steve Holland'],(Note: Strong language in paragraph 3)\n\nBy S...,0
3039,3039,http://president45donaldtrump.com/unemployment...,president45donaldtrump,Unemployment rate drops to 5.1% as more people...,[],There comes a certain point in time when manip...,1
