# Read and Process all Input Data Files
- Create for loop to read in all Kickstarter json files
- Flatten json "data" field
- Select columns for inclusion in analysis
- Dedupe data

In [1]:
# Import packages

import numpy as np
import pandas as pd
import json
import time
import csv
from pandas.io.json import json_normalize
import os
from flatten_json import flatten

## Test Run on single file prior to looping

In [14]:
# for each file, read it, flatten it, keep select columns

def process_data(filepath):
    
    # Read json file
    df = pd.read_json(filepath, lines = True)
      
    # flatten data field
    dict_flattened = (flatten(record, '.') for record in df['data'])
    df_new = pd.DataFrame(dict_flattened)
    
    # select only needed columns
    df_new = df_new[['id', 'name', 'blurb', 'goal', 'pledged', 'state', 'country', 'currency_symbol', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'backers_count', 'location.id', 'location.name', 'category.id', 'category.name', 'category.slug', 'spotlight', 'urls.web.project']]

    # add run_id from original df
    df_new['run_id'] = df['run_id']
    
    # dedupe df
    df_new = df_new.drop_duplicates()
    
    # drop all non-US, non-completed projects
    df_new = df_new[(df_new['country'] == 'US') & (df_new['state'].isin(['failed', 'successful']))]
    
    # Pickle the dataframe
    fileroot, fileext = os.path.splitext(filepath)
    output_filename = fileroot + '.pkl'
    df_new.to_pickle(output_filename)
#     
#     
#     df_new.to_csv(output_filename, sep='\t')
#     file = open(output_filename, "w")
#     file.write(text)
    
    
#     file.close()

In [15]:
process_data('input_data_files/Kickstarter_2018-08-16T03_20_13_856Z.json')

In [16]:
test = pd.read_pickle('input_data_files/Kickstarter_2018-08-16T03_20_13_856Z.pkl')

In [18]:
test.sample(5)

Unnamed: 0,id,name,blurb,goal,pledged,state,country,currency_symbol,deadline,state_changed_at,...,staff_pick,backers_count,location.id,location.name,category.id,category.name,category.slug,spotlight,urls.web.project,run_id
26469,1658171328,Tasting Room at Man Skirt Brewing,So close we can taste it: Help fund the tastin...,15000.0,25240.0,successful,US,$,1427333651,1427333655,...,False,164,2415906.0,Hackettstown,307,Drinks,food/drinks,True,https://www.kickstarter.com/projects/138169743...,Kickstarter_2018-08-16T03_20_13_856Z
122436,104490583,2ski Brewski,A device that allows double fisting with the u...,50000.0,50.0,failed,US,$,1441415093,1441415094,...,False,1,2459098.0,New Windsor,307,Drinks,food/drinks,False,https://www.kickstarter.com/projects/584875962...,Kickstarter_2018-08-16T03_20_13_856Z
147097,911150977,A New Quarterly on Photography,This new independent web magazine brings toget...,120.0,325.0,successful,US,$,1519851600,1519851600,...,True,27,12589335.0,Brooklyn,49,Periodicals,publishing/periodicals,True,https://www.kickstarter.com/projects/391287848...,Kickstarter_2018-08-16T03_20_13_856Z
40339,186876047,Medal of Victory,A feature comedy about two AWOL soldiers who a...,40000.0,40365.11,successful,US,$,1353463200,1353463233,...,True,293,2459115.0,New York,31,Narrative Film,film & video/narrative film,True,https://www.kickstarter.com/projects/535232476...,Kickstarter_2018-08-16T03_20_13_856Z
70856,1930401874,The Illuminated Path:A Journey of Self Love & ...,"In her upcoming book, Toni Chiapelli explores ...",5000.0,5155.0,successful,US,$,1299306344,1299306347,...,False,51,2453280.0,Monterey,15,Photography,photography,True,https://www.kickstarter.com/projects/35431595/...,Kickstarter_2018-08-16T03_20_13_856Z


## Create a for loop to process and pickle all the data

In [19]:
import glob

# package files into a list for iteration
filelist = glob.glob('input_data_files/Kickstarter_*.json')
filelist

['input_data_files/Kickstarter_2018-12-13T03_20_05_701Z.json',
 'input_data_files/Kickstarter_2019-02-14T03_20_04_734Z.json',
 'input_data_files/Kickstarter_2019-06-13T03_20_35_801Z.json',
 'input_data_files/Kickstarter_2018-09-13T03_20_17_777Z.json',
 'input_data_files/Kickstarter_2019-03-14T03_20_12_200Z.json',
 'input_data_files/Kickstarter_2018-10-18T03_20_48_880Z.json',
 'input_data_files/Kickstarter_2019-01-17T03_20_02_630Z.json',
 'input_data_files/Kickstarter_2018-11-15T03_20_50_568Z.json',
 'input_data_files/Kickstarter_2019-08-15T03_20_03_022Z.json',
 'input_data_files/Kickstarter_2019-04-18T03_20_02_220Z.json',
 'input_data_files/Kickstarter_2019-07-18T03_20_05_009Z.json',
 'input_data_files/Kickstarter_2019-05-16T03_20_20_822Z.json']

In [20]:
for i, filepath in enumerate(filelist):
    process_data(filepath)

## Append the files to create large dataframe

In [None]:
pkllist = glob.glob('input_data_files/Kickstarter_*.pkl')
pkllist

In [None]:
concat_df = pd.DataFrame()

for i, file in enumerate(pkllist):
    temp_df = pd.read_pickle(file)
    concat_df = pd.concat([concat_df, temp_df], axis=1)