In [1]:
import os, json
import pandas as pd

## Generator function that yields the path of every file in all sub-folders under the parent directory

In [2]:
def list_files(dirpath):
    """Lazily yield the path of every file found beneath ``dirpath``.

    The tree is traversed with ``os.walk``, so files in nested sub-folders
    are included. Each yielded path is the containing folder joined with
    the file name.
    """
    for folder, _subfolders, names in os.walk(dirpath):
        yield from (os.path.join(folder, name) for name in names)

## Load all .json files using json.loads(); each .json file contains only one line

In [3]:
%%time
# Collect one [published, site, title, text, url] row per JSON record found
# in the .json files under dirpath.
json_list = []
dirpath = 'us-financial-news-articles/'
for filePath in list_files(dirpath):
    # Ignore anything in the tree that is not a .json file.
    if not filePath.endswith('.json'):
        continue
    # The articles contain non-ASCII characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's default locale encoding.
    with open(filePath, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a JSON document
            # and would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            json_list.append([data['published'],
                              data['thread']['site'],
                              data['title'],
                              data['text'],
                              data['url']])
    

CPU times: user 42.8 s, sys: 36.4 s, total: 1min 19s
Wall time: 2min 41s


## Make sure the length of the JSON list matches the total number of files

In [4]:
# Number of records collected in json_list.
len(json_list)

306242

## Convert the JSON records to a DataFrame in order to perform data analysis

In [5]:
# Column labels, listed in the same order as the fields of each json_list row.
col_names = [
    'published_date',
    'source_name',
    'title',
    'body',
    'url',
]
df = pd.DataFrame(data=json_list, columns=col_names)

In [8]:
# (rows, columns) of the DataFrame.
df.shape

(306242, 5)

In [9]:
# Preview the first rows of the freshly built DataFrame.
df.head()

Unnamed: 0,published_date,source_name,title,body,url
0,2018-03-27T22:14:00.000+03:00,reuters.com,BRIEF-AU Optronics to pay cash dividend of T$1...,March 27(Reuters) - AU Optronics Corp :\n* Say...,https://www.reuters.com/article/brief-au-optro...
1,2018-03-08T15:42:00.000+02:00,reuters.com,British stats watchdog - stop using RPI inflat...,"March 8, 2018 / 1:35 PM / Updated an hour ago ...",https://uk.reuters.com/article/uk-britain-econ...
2,2018-03-23T22:23:00.000+03:00,reuters.com,Dropbox shares surge in IPO,"Dropbox shares surge in IPO Saturday, March 24...",https://in.reuters.com/video/2018/03/23/dropbo...
3,2018-03-13T01:05:00.000+02:00,reuters.com,Bookkeeper of Auschwitz dies before starting s...,BERLIN (Reuters) - The man known as the “bookk...,https://www.reuters.com/article/us-germany-naz...
4,2018-03-05T13:27:00.000+02:00,cnbc.com,US stocks set for a negative open as trade war...,Dow closes 336 points higher as trade-war worr...,https://www.cnbc.com/2018/03/05/us-stock-futur...


## Sort the data by date

In [10]:
# Sort chronologically. published_date holds ISO-8601 strings with differing
# UTC offsets (e.g. +02:00 and +03:00), so comparing the raw strings does not
# give true time order; rank the rows by the UTC instant they represent.
# The column itself is left as the original strings.
published_utc = pd.to_datetime(df['published_date'], utc=True)
# Positional, stable ordering: rows with equal timestamps keep their current
# relative order, and the result does not depend on index labels being unique.
df = df.iloc[published_utc.values.argsort(kind='mergesort')]


In [11]:
# Preview the sorted rows; the index labels still show each row's pre-sort position.
df.head()

Unnamed: 0,published_date,source_name,title,body,url
209579,2017-12-07T06:58:00.000+02:00,wsj.com,This entrepreneur is ringing up sales restorin...,"Published: Dec 7, 2017 4:58 a.m. ET Share \nCo...",https://www.wsj.com/articles/an-entrepreneur-b...
205030,2017-12-07T22:36:00.000+02:00,cnbc.com,Mexican official disputes reports of tainted a...,Mexico's secretary of tourism disputed reports...,https://www.cnbc.com/2017/12/07/mexican-offici...
238985,2017-12-07T22:45:00.000+02:00,cnbc.com,Saudi prince has history of extravagant impuls...,Timothy A. Clary | AFP | Getty Images Christie...,https://www.cnbc.com/2017/12/07/saudi-prince-h...
186087,2017-12-08T02:00:00.000+02:00,fortune.com,Golden Globes Predictions for Netflix's The Cr...,By Tom Huddleston Jr. 10:44 AM EST \nOn Sunday...,http://fortune.com/2018/01/05/golden-globes-20...
242189,2017-12-08T02:00:00.000+02:00,fortune.com,Bitcoin: Peter Thiel's Founders Fund Goes Big ...,7:54 PM EST \nPeter Thiel and his venture capi...,http://fortune.com/2018/01/02/bitcoin-peter-th...


In [12]:
# Renumber the rows 0..n-1 after sorting. drop=True discards the old index
# outright instead of inserting it as a column that then has to be deleted.
df = df.reset_index(drop=True)

In [17]:
# Check that the index has been renumbered from 0 after the reset.
df.head()

Unnamed: 0,published_date,source_name,title,body,url
0,2017-12-07T06:58:00.000+02:00,wsj.com,This entrepreneur is ringing up sales restorin...,"Published: Dec 7, 2017 4:58 a.m. ET Share \nCo...",https://www.wsj.com/articles/an-entrepreneur-b...
1,2017-12-07T22:36:00.000+02:00,cnbc.com,Mexican official disputes reports of tainted a...,Mexico's secretary of tourism disputed reports...,https://www.cnbc.com/2017/12/07/mexican-offici...
2,2017-12-07T22:45:00.000+02:00,cnbc.com,Saudi prince has history of extravagant impuls...,Timothy A. Clary | AFP | Getty Images Christie...,https://www.cnbc.com/2017/12/07/saudi-prince-h...
3,2017-12-08T02:00:00.000+02:00,fortune.com,Golden Globes Predictions for Netflix's The Cr...,By Tom Huddleston Jr. 10:44 AM EST \nOn Sunday...,http://fortune.com/2018/01/05/golden-globes-20...
4,2017-12-08T02:00:00.000+02:00,fortune.com,Bitcoin: Peter Thiel's Founders Fund Goes Big ...,7:54 PM EST \nPeter Thiel and his venture capi...,http://fortune.com/2018/01/02/bitcoin-peter-th...


## Export to .csv file

In [18]:
# Write the sorted DataFrame to a CSV file in the current working directory.
# NOTE(review): only the path is passed, so pandas' defaults apply; presumably
# the row index is written as an extra first column — confirm that is intended.
# NOTE(review): the file name says 2018, but the earliest rows previewed above
# are dated December 2017.
df.to_csv('us_financial_news_articles_2018.csv')