# Data Loading Scripts | Loads YAML files into one DataFrame

In [22]:
import pandas as pd
import yaml
import os
import sys
from hurry.filesize import size
from joblib import Parallel, delayed

In [2]:
def yaml_to_df(file_path):
    '''
    Read a (possibly multi-document) YAML data file and return it as a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the YAML file to read. The file is decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        A frame built from the sequence of YAML documents in the file.
        Presumably each document is a mapping of field name -> value, so each
        document becomes one row -- verify against the data files.

    Notes
    -----
    Uses ``yaml.safe_load_all`` rather than ``yaml.load_all``: calling
    ``load_all`` without an explicit ``Loader`` is unsafe on untrusted input,
    deprecated since PyYAML 5.1, and raises ``TypeError`` in PyYAML 6.
    '''
    with open(file_path, "r", encoding = "ISO-8859-1") as stream:
        # safe_load_all is lazy; materialise every document while the file
        # is still open so nothing is read after the handle is closed.
        documents = list(yaml.safe_load_all(stream))
    return pd.DataFrame(documents)

In [26]:
# Folder containing the yaml data files
folder = r"data"

# Collect the paths of the yaml data files (joined onto `folder`, so they are
# relative unless `folder` itself is absolute).
# - endswith() only accepts real ".yaml" files (a substring test would also
#   match names such as "x.yaml.bak").
# - sorted() makes the order deterministic; os.listdir() returns entries in
#   arbitrary order, which would make the subset selected below unrepeatable.
files = sorted(os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.yaml'))
print("Total files to process %s"%(len(files)))

# In case you cannot process all the files (e.g. low memory), you can still
# explore a subset of the data.
files_to_process = len(files) # change this to the number of files you need to process
files = files[:files_to_process]

Total files to process 150


### Parallel data loading

In [3]:
# Number of parallel worker processes to start.
# Defaults to one less than the number of logical processors on this machine
# (at least 1; os.cpu_count() may return None, in which case 1 is used).
# Lower it by hand if memory is tight -- every worker loads a whole file.
Num_Cores = max(1, (os.cpu_count() or 2) - 1)

# Load every yaml file into its own DataFrame, one file per task.
data_df_list = Parallel(n_jobs=Num_Cores)(delayed(yaml_to_df)(file) for file in files)

# Concatenate all the dataframes into one, renumbering the rows from 0.
data_df = pd.concat(data_df_list, ignore_index=True)

Total files to process 150




In [16]:
# Preview the first five rows of the combined dataframe
data_df.head(n=5)

Unnamed: 0,authors,date,globalID,heading,language,publisher,summary,tags,text,url
0,[The Newspaper's Staff Reporter],29-06-2014,1360000,Court orders IO’s arrest in arson attack case,en,Dawn News,,[Court orders IO’s arrest in arson attack case...,KARACHI: An anti-terrorism court on Saturday d...,http://www.dawn.com/news/1115837
1,[Reuters],29-06-2014,1360001,Brilliant Rodriguez leads Colombia into last e...,en,Dawn News,,[Brilliant Rodriguez leads Colombia into last ...,RIO JANEIRO: James Rodriguez scored a contende...,http://www.dawn.com/news/1115838
2,[The Newspaper's Staff Reporter],29-06-2014,1360002,ICU at Valika Hospital inaugurated,en,Dawn News,,"[ICU at Valika Hospital inaugurated , Newspap...","KARACHI: An 11-bed intensive care unit, named ...",http://www.dawn.com/news/1115839
3,[Kashif Abbasi],29-06-2014,1360003,Peace in Fata linked to abolition of FCR,en,Dawn News,,"[Peace in Fata linked to abolition of FCR , ...",ISLAMABAD: For bringing prosperity and restori...,http://www.dawn.com/news/1115840
4,[Tariq Naqash],29-06-2014,1360004,PPAJK awards ticket to ex-general’s spouse,en,Dawn News,,"[PPAJK awards ticket to ex, general’s spouse ,...",MUZAFFARABAD: The ruling People’s Party in Aza...,http://www.dawn.com/news/1115841


In [25]:
# Total memory held by the dataframe, shown in human-readable units.
# deep=True inspects object columns instead of only counting their pointers;
# index=True includes the index in the total.
size(
    data_df
    .memory_usage(index=True, deep=True)
    .sum()
)

'9G'

In [14]:
# Persist the combined dataframe to disk as a pickle file
data_df.to_pickle(path="data.pkl")

## Quick way: load the saved data file

In [32]:
# Load the dataframe back from the pickle file ("data.pkl", the same file name
# the save cell writes), skipping the yaml parsing entirely.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you
# created yourself or otherwise trust.
data_new = pd.read_pickle("data.pkl")