# Merge data
* The data format follows Hyesoo's format
* All data is exported to a CSV file

In [3]:
import pandas as pd
import os

## Import Jinmei's data

### import fake news

In [4]:
# Load Jinmei's fake-news file, reading only the columns at positions 1-5
# (position 0 is skipped), and tag every row as fake: authenticity = 0.
path = os.path.join('data', 'fakenews_jz.csv')
df_fakenews = pd.read_csv(path, usecols=list(range(1, 6))).assign(authenticity=0)

### import real news

In [5]:
# Load Jinmei's real-news file, reading only the columns at positions 0-4,
# and tag every row as real: authenticity = 1.
path = os.path.join('data', 'realnews_jz.csv')
df_realnews = pd.read_csv(path, usecols=list(range(5))).assign(authenticity=1)

### merge fake and real news

In [6]:
# Stack the fake rows on top of the real rows and renumber the index from 0.
df_jinmei = pd.concat(
    [df_fakenews, df_realnews],
    axis=0,
    ignore_index=True,
)
df_jinmei.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1271,http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",[],,1
1272,http://www.newyorker.com/cartoon/dernavich-201...,newyorker,A Cartoon from The New Yorker,[],,1
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,1
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",1
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,1


In [5]:
# (rows, columns) of the combined fake + real frame, before any cleaning.
df_jinmei.shape

(1276, 6)

### clean data

In [7]:
# Discard every row whose 'text' value is missing (NaN); rows are the
# default axis and 'any' the default rule, so only the subset is spelled out.
df_jinmei = df_jinmei.dropna(subset=['text'])
df_jinmei.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1268,http://www.newyorker.com/humor/borowitz-report...,newyorker,Stephen Hawking Angers Trump Supporters with B...,[],LONDON ( The Borowitz Report )—The theoretical...,1
1269,http://tunein.com/radio/New-Yorker-Radio-Hour-...,newyorker,The New Yorker Radio Hour,[],Description:\n\nDavid Remnick is joined by The...,1
1273,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...,1
1274,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ...",1
1275,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...,1


In [7]:
# (rows, columns) after the rows with missing text have been dropped.
df_jinmei.shape

(1213, 6)

In [8]:
# Keep only the articles whose text is longer than 500 characters
# (a text of exactly 500 characters or fewer is dropped).
df_jinmei = df_jinmei.loc[df_jinmei['text'].map(len) > 500]
df_jinmei.shape

(822, 6)

## Import Hyesoo's data

In [9]:
# Load Hyesoo's file, reading only the columns at positions 1-6
# (position 0 is skipped).
path = os.path.join('data', 'hyesoo_df.csv')
df_hyesoo = pd.read_csv(path, usecols=list(range(1, 7)))
df_hyesoo.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
1736,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,1
1737,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,1
1738,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,1
1739,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,1
1740,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,1


## Merge all data

In [10]:
# Append Hyesoo's rows after Jinmei's cleaned rows and renumber the index from 0.
df_merge = pd.concat(
    [df_jinmei, df_hyesoo],
    axis=0,
    ignore_index=True,
)
df_merge.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity
2558,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,1
2559,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,1
2560,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,1
2561,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,1
2562,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,1


In [11]:
# (rows, columns) of the merged Jinmei + Hyesoo frame.
df_merge.shape

(2563, 6)

## Export data to csv file

In [12]:
# Write the merged frame to data/merged_data.csv. The row index is written
# as the first column (the pandas default, made explicit here).
path = os.path.join('data', 'merged_data.csv')
df_merge.to_csv(path, index=True)

In [12]:
# Sanity check (disabled): re-read the exported CSV and inspect its last rows
#df_test = pd.read_csv(path) 
#df_test.tail()

## Balance data

In [11]:
# label_num is the inverse of authenticity:
#   authenticity 0 (fake) -> label_num 1
#   authenticity 1 (real) -> label_num 0
df_merge['label_num'] = df_merge['authenticity'].map({0: 1, 1: 0})
df_merge.tail(5)

Unnamed: 0,url,source,title,author,text,authenticity,label_num
2558,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,1,0
2559,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,1,0
2560,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,1,0
2561,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,1,0
2562,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,1,0


### sample real news
Sample the real news so that the number of real articles is approximately equal to the number of fake articles.

In [13]:
# Down-sample the real news (label_num == 0) to 64% of its rows so that its
# size roughly matches the fake-news subset.
# random_state pins the draw: without it every run picks a different subset,
# so the exported balanced dataset could not be reproduced.
data_balanced_realnews = df_merge[df_merge.label_num == 0].sample(
    frac=0.64,
    random_state=42,
)
data_balanced_realnews.shape

(1002, 7)

### get all fake news

In [14]:
# Keep every fake-news row (label_num == 1); no sampling on this side.
data_balanced_fakenews = df_merge.loc[df_merge['label_num'] == 1]
data_balanced_fakenews.shape

(997, 7)

### combine real and fake news

In [16]:
# Put the sampled real news and all fake news into one frame with a fresh
# 0-based index, then count the rows per class to check the balance.
data_balanced = pd.concat(
    [data_balanced_realnews, data_balanced_fakenews],
    axis=0,
    ignore_index=True,
)
data_balanced['label_num'].value_counts()

0    1002
1     997
Name: label_num, dtype: int64

### export data

In [17]:
# Write the balanced frame to data/balanced_data.csv. The row index is
# written as the first column (the pandas default, made explicit here).
path = os.path.join('data', 'balanced_data.csv')
data_balanced.to_csv(path, index=True)