# Merge data
* The data format follows Hyesoo's format
* The merged data and a class-balanced subset are each exported to a CSV file

In [1]:
import pandas as pd
import os

## Import Jinmei's data

### import fake news

In [2]:
# Fake-news articles: read CSV columns 1-5 by position (column 0 is skipped)
# and label every row with authenticity = 1.
path = os.path.join('data', 'fakenews_jz.csv')
df_fakenews = pd.read_csv(path, usecols=[1, 2, 3, 4, 5]).assign(authenticity=1)

### import real news

In [3]:
# Real-news articles: read CSV columns 0-4 by position and label every row
# with authenticity = 0.
path = os.path.join('data', 'realnews_jz.csv')
df_realnews = pd.read_csv(path, usecols=[0, 1, 2, 3, 4]).assign(authenticity=0)

### merge fake and real news

In [4]:
# Stack the fake and real frames row-wise under a fresh 0..n-1 index,
# then preview the last five rows.
df_jinmei = pd.concat(
    [df_fakenews, df_realnews],
    axis=0,
    ignore_index=True,
)
df_jinmei.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1271,http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",[],,0
1272,http://www.newyorker.com/cartoon/dernavich-201...,newyorker,A Cartoon from The New Yorker,[],,0
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,0
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",0
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,0


In [5]:
# (rows, columns) of the combined fake + real frame, before any cleaning.
df_jinmei.shape

(1276, 6)

### clean data

In [6]:
# Discard articles that have no body text (NaN in the `text` column);
# the surviving rows keep their original index labels.
df_jinmei = df_jinmei.loc[df_jinmei['text'].notna()]
df_jinmei.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1268,http://www.newyorker.com/humor/borowitz-report...,newyorker,Stephen Hawking Angers Trump Supporters with B...,[],LONDON ( The Borowitz Report )—The theoretical...,0
1269,http://tunein.com/radio/New-Yorker-Radio-Hour-...,newyorker,The New Yorker Radio Hour,[],Description:\n\nDavid Remnick is joined by The...,0
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,0
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",0
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,0


In [7]:
# (rows, columns) remaining after rows with missing text were dropped.
df_jinmei.shape

(1213, 6)

In [8]:
# Keep only articles whose text is longer than 500 characters
# (rows with 500 characters or fewer are removed).
df_jinmei = df_jinmei.loc[df_jinmei['text'].map(len).gt(500)]
df_jinmei.shape

(822, 6)

## Import Hyesoo's data

In [9]:
# Load Hyesoo's articles (CSV columns 1-6 by position) and invert the 0/1
# authenticity labels through the `swap` lookup. Any label other than 0 or 1
# would become NaN.
# NOTE(review): presumably the flip aligns this file with the 1 = fake /
# 0 = real convention used for the other sources — confirm.
path = os.path.join('data', 'hyesoo_df.csv')
df_hyesoo = pd.read_csv(path, usecols=[1, 2, 3, 4, 5, 6])
swap = {0: 1, 1: 0}
df_hyesoo['authenticity'] = df_hyesoo['authenticity'].map(swap)
df_hyesoo.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1736,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,0
1737,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,0
1738,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,0
1739,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,0
1740,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,0


## Import additional fake data

In [20]:
# Load the additional fake-news set (CSV columns 1-6 by position).
path = os.path.join('data', 'Additional_fake_df.csv')
df_add = pd.read_csv(path, usecols=[1, 2, 3, 4, 5, 6])
# Remove the row at position 474.
# NOTE(review): the reason this row is excluded is not recorded here — confirm.
df_add = df_add.drop(index=df_add.index[474])
# Label distribution after the removal.
df_add['authenticity'].value_counts()

1    476
Name: authenticity, dtype: int64

## Merge all data

In [11]:
# Combine the three sources row-wise under a fresh 0..n-1 index,
# then preview the last five rows.
df_merge = pd.concat(
    [df_jinmei, df_hyesoo, df_add],
    axis=0,
    ignore_index=True,
)
df_merge.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
3034,http://americanflavor.news/2017/04/30/trump-ad...,AmericanFlavor,After Trump Administration Negotiations China ...,[],A Houston woman held in custody by China in 20...,1
3035,https://sputniknews.com/columnists/20170713105...,sputniknews,Russia 'Collusion' Smoking Gun?,[],"This is the proof, we are told, that the Trump...",1
3036,http://pamelageller.com/2017/07/state-departme...,pamelageller,State Department Removes Word ‘Genocide’ From ...,"['Geller Report Staff', 'Mark Steiner', 'David...",The U.S. State Department under President Dona...,1
3037,https://sputniknews.com/science/20170801105605...,sputniknews,To the Stars We Return: Start-Ups Offering Spa...,[],A Houston-based startup will send your cremate...,1
3038,https://sputniknews.com/radio_trendstorm/20170...,sputniknews,China & India: Walking the Himalayan Tightrope,[],China & India: Walking the Himalayan Tightrope...,1


In [12]:
# Number of rows per authenticity label in the merged frame.
df_merge['authenticity'].value_counts()

0    1566
1    1473
Name: authenticity, dtype: int64

## Export data to csv file

In [13]:
# Persist the merged frame; the row index is written as the first CSV column.
path = os.path.join('data', 'merged_data.csv')
df_merge.to_csv(path, index=True)

In [14]:
# Fake-to-real ratio of the merged data. The counts are derived from
# `df_merge` rather than typed in from an earlier output, so the ratio stays
# correct when the input files change.
n_fake = int((df_merge['authenticity'] == 1).sum())
n_real = int((df_merge['authenticity'] == 0).sum())
A = n_fake / n_real
A

0.9406130268199234

## Balance data

### sample real news
Down-sample the real news so that the number of real articles ≈ the number of fake articles.

In [15]:
# Down-sample the real news (authenticity == 0) to the number of fake-news
# rows. The sample size is computed from the data instead of a hard-coded
# fraction, and `random_state` is fixed so that Restart & Run All reproduces
# the same subset.
data_balanced_realnews = df_merge[df_merge.authenticity == 0]
n_fake_rows = int((df_merge.authenticity == 1).sum())
data_balanced_realnews = data_balanced_realnews.sample(
    # Never request more rows than exist (sampling is without replacement).
    n=min(n_fake_rows, len(data_balanced_realnews)),
    random_state=42,
)
data_balanced_realnews.shape

(1473, 6)

### get all fake news

In [16]:
# All fake-news rows (authenticity == 1) are kept; display their count.
data_balanced_fakenews = df_merge.loc[df_merge['authenticity'].eq(1)]
data_balanced_fakenews.shape

(1473, 6)

### combine real and fake news

In [17]:
# Combine the sampled real news with all fake news, then shuffle the rows.
# `random_state` is fixed so the exported row order is reproducible.
data_balanced = pd.concat([data_balanced_realnews, data_balanced_fakenews], ignore_index=True)
data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
# Sanity check shown as the cell output: both labels should have equal counts.
data_balanced.authenticity.value_counts()

### export data

In [18]:
# Persist the balanced frame; the row index is written as the first CSV column.
path = os.path.join('data', 'balanced_data.csv')
data_balanced.to_csv(path, index=True)

In [19]:
# Preview the first 15 rows of the shuffled, balanced frame.
data_balanced.iloc[:15]

Unnamed: 0,url,source,title,author,text,authenticity
0,http://politicalo.com/reince-priebus-older-peo...,politicalo,Reince Priebus: “Older People Should Pay 5 Tim...,['Lea Vat Kens'],"Appearing on Fox News Sunday, White House Chie...",1
1,http://www.reuters.com/article/us-usa-trump-sc...,reuters,Factbox: Scaramucci's financial potpourri - ba...,"['Lawrence Delevingne', 'Svea Herbst-Bayliss']",White House Communications Director Anthony Sc...,0
2,http://www.cnn.com/travel/article/vermont-whis...,cnn,Is this the world's best whiskey?,[],"Jen Rose Smith, CNN • Updated 27th July 2017\n...",0
3,http://www.npr.org/2017/07/27/539907467/senate...,npr,"Senate Careens Toward High-Drama, Late Night H...",['Susan Davis'],"Senate Careens Toward High-Drama, Late Night H...",0
4,https://www.interestingdailynews.com/rachel-ma...,interestingdailynews,Rachel Maddow Exposes Mike Pence’s 4 Biggest L...,['Interesting Daily News'],Rachel Maddow demonstrated why Vice President ...,1
5,https://www.interestingdailynews.com/man-rapes...,interestingdailynews,"Man Rapes Infant, Leaves Her With Something To...",['Interesting Daily News'],"Man Rapes Infant, Leaves Her With Something To...",1
6,http://www.cnn.com/2017/07/27/europe/charlie-g...,cnn,Charlie Gard will be moved to hospice for fina...,"['Lauren Said-Moorhouse', 'Richard Allen Greene']","London (CNN) Charlie Gard, the terminally ill ...",0
7,http://www.newyorker.com/news/sporting-scene/k...,newyorker,Kyrie Irving’s Anxiety of Influence,[],"When it was reported, last week, that Kyrie Ir...",0
8,http://www.newyorker.com/magazine/2017/07/31/p...,newyorker,Panorama’s Contemporary Scope,[],Modern music festivals affirm the contemporary...,0
9,http://beforeitsnews.com/sports/2017/07/titans...,beforeitsnews,Titans Ink Former Colts OLB Erik Walden,['Sports Gab Network'],(Before It's News)\n\nThe Titans have agreed t...,1
