# Read and Process all Input Data Files
- Create for loop to read in all Kickstarter json files
- Flatten json "data" field
- Select columns for inclusion in analysis
- Dedupe data

In [1]:
# Import packages

import numpy as np
import pandas as pd
import json
import time
import csv
from pandas.io.json import json_normalize
import os
from flatten_json import flatten

## Test Run on single file prior to looping

In [14]:
# for each file, read it, flatten it, keep select columns

def process_data(filepath):
    
    # Read json file
    df = pd.read_json(filepath, lines = True)
      
    # flatten data field
    dict_flattened = (flatten(record, '.') for record in df['data'])
    df_new = pd.DataFrame(dict_flattened)
    
    # select only needed columns
    df_new = df_new[['id', 'name', 'blurb', 'goal', 'pledged', 'state', 'country', 'currency_symbol', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'backers_count', 'location.id', 'location.name', 'category.id', 'category.name', 'category.slug', 'spotlight', 'urls.web.project']]

    # add run_id from original df
    df_new['run_id'] = df['run_id']
    
    # dedupe df
    df_new = df_new.drop_duplicates()
    
    # drop all non-US, non-completed projects
    df_new = df_new[(df_new['country'] == 'US') & (df_new['state'].isin(['failed', 'successful']))]
    
    # Pickle the dataframe
    fileroot, fileext = os.path.splitext(filepath)
    output_filename = fileroot + '.pkl'
    df_new.to_pickle(output_filename)
#     
#     
#     df_new.to_csv(output_filename, sep='\t')
#     file = open(output_filename, "w")
#     file.write(text)
    
    
#     file.close()

In [15]:
process_data('input_data_files/Kickstarter_2018-08-16T03_20_13_856Z.json')

In [16]:
test = pd.read_pickle('input_data_files/Kickstarter_2018-08-16T03_20_13_856Z.pkl')

In [18]:
test.sample(5)

Unnamed: 0,id,name,blurb,goal,pledged,state,country,currency_symbol,deadline,state_changed_at,...,staff_pick,backers_count,location.id,location.name,category.id,category.name,category.slug,spotlight,urls.web.project,run_id
26469,1658171328,Tasting Room at Man Skirt Brewing,So close we can taste it: Help fund the tastin...,15000.0,25240.0,successful,US,$,1427333651,1427333655,...,False,164,2415906.0,Hackettstown,307,Drinks,food/drinks,True,https://www.kickstarter.com/projects/138169743...,Kickstarter_2018-08-16T03_20_13_856Z
122436,104490583,2ski Brewski,A device that allows double fisting with the u...,50000.0,50.0,failed,US,$,1441415093,1441415094,...,False,1,2459098.0,New Windsor,307,Drinks,food/drinks,False,https://www.kickstarter.com/projects/584875962...,Kickstarter_2018-08-16T03_20_13_856Z
147097,911150977,A New Quarterly on Photography,This new independent web magazine brings toget...,120.0,325.0,successful,US,$,1519851600,1519851600,...,True,27,12589335.0,Brooklyn,49,Periodicals,publishing/periodicals,True,https://www.kickstarter.com/projects/391287848...,Kickstarter_2018-08-16T03_20_13_856Z
40339,186876047,Medal of Victory,A feature comedy about two AWOL soldiers who a...,40000.0,40365.11,successful,US,$,1353463200,1353463233,...,True,293,2459115.0,New York,31,Narrative Film,film & video/narrative film,True,https://www.kickstarter.com/projects/535232476...,Kickstarter_2018-08-16T03_20_13_856Z
70856,1930401874,The Illuminated Path:A Journey of Self Love & ...,"In her upcoming book, Toni Chiapelli explores ...",5000.0,5155.0,successful,US,$,1299306344,1299306347,...,False,51,2453280.0,Monterey,15,Photography,photography,True,https://www.kickstarter.com/projects/35431595/...,Kickstarter_2018-08-16T03_20_13_856Z


## Create a for loop to process and pickle all the data

In [3]:
import glob

In [19]:
# package files into a list for iteration
filelist = glob.glob('input_data_files/Kickstarter_*.json')
filelist

['input_data_files/Kickstarter_2018-12-13T03_20_05_701Z.json',
 'input_data_files/Kickstarter_2019-02-14T03_20_04_734Z.json',
 'input_data_files/Kickstarter_2019-06-13T03_20_35_801Z.json',
 'input_data_files/Kickstarter_2018-09-13T03_20_17_777Z.json',
 'input_data_files/Kickstarter_2019-03-14T03_20_12_200Z.json',
 'input_data_files/Kickstarter_2018-10-18T03_20_48_880Z.json',
 'input_data_files/Kickstarter_2019-01-17T03_20_02_630Z.json',
 'input_data_files/Kickstarter_2018-11-15T03_20_50_568Z.json',
 'input_data_files/Kickstarter_2019-08-15T03_20_03_022Z.json',
 'input_data_files/Kickstarter_2019-04-18T03_20_02_220Z.json',
 'input_data_files/Kickstarter_2019-07-18T03_20_05_009Z.json',
 'input_data_files/Kickstarter_2019-05-16T03_20_20_822Z.json']

In [20]:
for i, filepath in enumerate(filelist):
    process_data(filepath)

## Concat the files to create large dataframe

In [4]:
# Package the list of files for iteration
pkllist = glob.glob('input_data_files/Kickstarter_*.pkl')
pkllist

['input_data_files/Kickstarter_2019-03-14T03_20_12_200Z.pkl',
 'input_data_files/Kickstarter_2019-07-18T03_20_05_009Z.pkl',
 'input_data_files/Kickstarter_2019-04-18T03_20_02_220Z.pkl',
 'input_data_files/Kickstarter_2019-05-16T03_20_20_822Z.pkl',
 'input_data_files/Kickstarter_2019-02-14T03_20_04_734Z.pkl',
 'input_data_files/Kickstarter_2018-10-18T03_20_48_880Z.pkl',
 'input_data_files/Kickstarter_2019-01-17T03_20_02_630Z.pkl',
 'input_data_files/Kickstarter_2018-09-13T03_20_17_777Z.pkl',
 'input_data_files/Kickstarter_2019-08-15T03_20_03_022Z.pkl',
 'input_data_files/Kickstarter_2018-12-13T03_20_05_701Z.pkl',
 'input_data_files/Kickstarter_2019-06-13T03_20_35_801Z.pkl',
 'input_data_files/Kickstarter_2018-11-15T03_20_50_568Z.pkl',
 'input_data_files/Kickstarter_2018-08-16T03_20_13_856Z.pkl']

In [5]:
# Iterate over list of files, concatenating and dropping temp_df
concat_df = pd.DataFrame()

for i, file in enumerate(pkllist):
    temp_df = pd.read_pickle(file)
    concat_df = pd.concat([concat_df, temp_df])

## Inspect data

In [6]:
concat_df.shape

(1668216, 22)

In [27]:
concat_df.sample(10)

Unnamed: 0,id,name,blurb,goal,pledged,state,country,currency_symbol,deadline,state_changed_at,...,staff_pick,backers_count,location.id,location.name,category.id,category.name,category.slug,spotlight,urls.web.project,run_id
91459,1700356381,Soothsayer,Soothsayer is a family owned and operated rest...,20000.0,20755.0,successful,US,$,1447864346,1447864346,...,True,121,12589342.0,Manhattan,312,Restaurants,food/restaurants,True,https://www.kickstarter.com/projects/240021178...,Kickstarter_2019-08-15T03_20_03_022Z
63625,1457203604,A. Fisher Brewing Company Revived,"Est. 1884 before closing 1967, A Fisher Brewin...",7500.0,20427.0,successful,US,$,1485932340,1485932342,...,False,229,2487610.0,Salt Lake City,307,Drinks,food/drinks,True,https://www.kickstarter.com/projects/984672884...,Kickstarter_2018-09-13T03_20_17_777Z
166823,816898077,IF THIS GOES ON - Political SF Anthology,A speculative anthology of thirty short storie...,10000.0,12369.8,successful,US,$,1530276589,1530276589,...,True,372,2420394.0,Herndon,324,Anthologies,publishing/anthologies,True,https://www.kickstarter.com/projects/102233767...,Kickstarter_2019-04-18T03_20_02_220Z
88543,1509154758,Let's Turn Plastic into Dinosaurs!,"This Summer Solstice, Explore Ecology staff an...",1500.0,60.0,failed,US,$,1371790800,1371790836,...,False,2,2488828.0,Santa Barbara,53,Public Art,art/public art,False,https://www.kickstarter.com/projects/135036316...,Kickstarter_2018-10-18T03_20_48_880Z
174291,900009369,SUPPORT AND SAVE NORTH END STUDIOS,SAVE North End Studios\nHELP US PURCHASE THE B...,500000.0,55.0,failed,US,$,1321467161,1321467163,...,False,3,2391585.0,Detroit,53,Public Art,art/public art,False,https://www.kickstarter.com/projects/113010464...,Kickstarter_2018-11-15T03_20_50_568Z
47989,445001519,"New Works, New Dancers, New Perspectives!",We aim to give new and emerging choreographers...,1300.0,1550.0,successful,US,$,1362074340,1362074340,...,False,23,2379574.0,Chicago,6,Dance,dance,True,https://www.kickstarter.com/projects/367488216...,Kickstarter_2019-04-18T03_20_02_220Z
161309,189208618,Honeybee V2 Playing Cards by Penguin Magic,"A beautiful update to the original, sold-out H...",12000.0,13285.0,successful,US,$,1489679962,1489679962,...,False,396,2478521.0,Rancho Cordova,273,Playing Cards,games/playing cards,True,https://www.kickstarter.com/projects/penguinma...,Kickstarter_2018-11-15T03_20_50_568Z
104338,518656702,"Tight Loops presents: ""Big Land""",An expedition film into the heart of Labradors...,15000.0,28839.0,successful,US,$,1531886720,1531886720,...,True,355,2460595.0,North Attleboro,30,Documentary,film & video/documentary,True,https://www.kickstarter.com/projects/tightloop...,Kickstarter_2019-01-17T03_20_02_630Z
109430,458891861,GIRLY: If Online Poker Players Were A Bunch Of...,The pilot episode for a webseries about 4 girl...,5000.0,1686.0,failed,US,$,1304057893,1304057897,...,False,16,3534.0,Montreal,33,Webseries,film & video/webseries,False,https://www.kickstarter.com/projects/235275055...,Kickstarter_2018-08-16T03_20_13_856Z
163318,1296631204,Estill Masonry Artworks,We need help publishing 150-200 pages Picture ...,10000.0,0.0,failed,US,$,1428079225,1428079228,...,False,0,2442327.0,Louisville,54,Mixed Media,art/mixed media,False,https://www.kickstarter.com/projects/177479097...,Kickstarter_2018-09-13T03_20_17_777Z


In [28]:
concat_df = concat_df.drop_duplicates()

In [29]:
concat_df.shape

(1668216, 22)

In [30]:
concat_df.isnull().sum()

id                     0
name                   0
blurb                  0
goal                   0
pledged                0
state                  0
country                0
currency_symbol        0
deadline               0
state_changed_at       0
created_at             0
launched_at            0
staff_pick             0
backers_count          0
location.id         5979
location.name       5979
category.id            0
category.name          0
category.slug          0
spotlight              0
urls.web.project       0
run_id                 0
dtype: int64

In [7]:
concat_df.to_pickle('concat_df.pkl')

In [9]:
# Import package to convert time from unix
from datetime import datetime

In [11]:
# Convert time 
utc_deadline = datetime.utcfromtimestamp(concat_df['deadline'])

TypeError: cannot convert the series to <class 'int'>