# Initial data cleaning

In [1]:
import numpy as np
import pandas as pd
import json

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Loading dataset and dropping the first column, because it contains rownumbers
df = pd.read_csv('data/Kickstarter_merged.csv')
df = df.drop(columns='Unnamed: 0', axis=1) 

In [3]:
# Print the column names
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')

In [4]:
# look at the first 10 rows of the columns 0 to 10 to find odd values
#df.iloc[0:10, 0:11]

Columns 1 to 10: 'backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency'
* There are too many values in the columns category and creator.
* The column created_at has values that do not look like time data.

In [23]:
df.category[0]

'{"id":266,"name":"Footwear","slug":"fashion/footwear","position":5,"parent_id":9,"color":16752598,"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/fashion/footwear"}}}'

In [6]:
# Get name out of JSON formatted dictionary
import json
n = json.loads(df.category[0])
print(n.get("name"))

Footwear


In [13]:
# Extract the category name and make a new column with the category name
df["category_name"] = ""
for i in range(len(df.category)):
    try:
        dict_cat = json.loads(df.category[i])
        df.category_name[i] = dict_cat.get("name")
    except:
        df.category_name[i] = NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.category_name[i] = dict_cat.get("name")


In [22]:
df.creator[0]

'{"id":2094277840,"name":"Lucy Conroy","slug":"babalus","is_registered":null,"chosen_currency":null,"avatar":{"thumb":"https://ksr-ugc.imgix.net/assets/023/784/556/6ed11b25c853ec1aef7f4360d0eb59ef_original.jpg?ixlib=rb-1.1.0&w=40&h=40&fit=crop&v=1548222691&auto=format&frame=1&q=92&s=b64463d8ae6195f7aeb62393e2ca2dde","small":"https://ksr-ugc.imgix.net/assets/023/784/556/6ed11b25c853ec1aef7f4360d0eb59ef_original.jpg?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=1548222691&auto=format&frame=1&q=92&s=00bc518b23a932bd76fb6e21f4eb6834","medium":"https://ksr-ugc.imgix.net/assets/023/784/556/6ed11b25c853ec1aef7f4360d0eb59ef_original.jpg?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=1548222691&auto=format&frame=1&q=92&s=00bc518b23a932bd76fb6e21f4eb6834"},"urls":{"web":{"user":"https://www.kickstarter.com/profile/babalus"},"api":{"user":"https://api.kickstarter.com/v1/users/2094277840?signature=1552621545.c7a32fed985a78dec253fe61c1acb7a99edbc0af"}}}'

In [8]:
# Get name out of json formatted library
import json
m = json.loads(df.creator[0])
print(m.get("name"))

Lucy Conroy


In [11]:
# Extract the creator name and make a new column with the creator name
df["creator_name"] = ""
for j in range(len(df.creator)):
    try:
        dict_cre = json.loads(df.creator[j])
        df.creator_name[j] = dict_cre.get("name")
    except:
        df.creator_name[j] = 'NaN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.creator_name[j] = dict_cre.get("name")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.creator_name[j] = 'NaN'


In [None]:
# look at the first 10 rows of the columns 11 to 20 to find odd values
df.iloc[0:10, 11:21]

Columns 11 to 20: 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at'
* There are many NaN in the columns friends, is_backing and is_starred.
* The columns deadline and launched_at have values that do not look like time data.

In [None]:
# look at the first 10 rows of the columns 21 to 30 to find odd values
df.iloc[0:10, 21:31]

Columns 21 to 30: 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug','source_url', 'spotlight', 'staff_pick'
* There are too many values in the columns location and profile.
* The column photo is a dictionary.
* There are many NaN in the column permissions.

In [12]:
df.location[0]

'{"id":2462429,"name":"Novato","slug":"novato-ca","short_name":"Novato, CA","displayable_name":"Novato, CA","localized_name":"Novato","country":"US","state":"CA","type":"Town","is_root":false,"urls":{"web":{"discover":"https://www.kickstarter.com/discover/places/novato-ca","location":"https://www.kickstarter.com/locations/novato-ca"},"api":{"nearby_projects":"https://api.kickstarter.com/v1/discover?signature=1552595066.49b64db66a5124f5831752d055cd09aff20cc652&woe_id=2462429"}}}'

In [14]:
# Extract the location name and make a new column with the location name
df["location_name"] = ""
for k in range(len(df.location)):
    try:
        dict_loc = json.loads(df.location[k])
        df.location_name[k] = dict_loc.get("name")
    except:
        df.location_name[k] = 'NaN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.location_name[k] = dict_loc.get("name")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.location_name[k] = 'NaN'


In [19]:
df.profile[0]

'{"id":3508024,"project_id":3508024,"state":"inactive","state_changed_at":1541459205,"name":null,"blurb":null,"background_color":null,"text_color":null,"link_background_color":null,"link_text_color":null,"link_text":null,"link_url":null,"show_feature_image":false,"background_image_opacity":0.8,"should_show_feature_image_section":true,"feature_image_attributes":{"image_urls":{"default":"https://ksr-ugc.imgix.net/assets/023/667/205/a565fde5382d6b53276597bcbf505af7_original.jpg?ixlib=rb-1.1.0&crop=faces&w=1552&h=873&fit=crop&v=1546238810&auto=format&frame=1&q=92&s=4faccb2ba6fae37a2d990e8471669753","baseball_card":"https://ksr-ugc.imgix.net/assets/023/667/205/a565fde5382d6b53276597bcbf505af7_original.jpg?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1546238810&auto=format&frame=1&q=92&s=53798a47ff4e37129dfd4d11827fa5c4"}}}'

In [24]:
# Extract the profile name and make a new column with the profile name
df["profile_name"] = ""
for l in range(len(df.profile)):
    try:
        dict_cre = json.loads(df.profile[l])
        df.profile_name[l] = dict_cre.get("name")
    except:
        df.profile_name[l] = 'NaN'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.profile_name[l] = dict_cre.get("name")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.profile_name[l] = 'NaN'


In [27]:
# look at the first 10 rows of the columns 31 to 36 to find odd values
df.iloc[0:10, 31:37]

Unnamed: 0,usd_pledged,usd_type,category_name,creator_name,location_name,profile_name
0,28645.0,international,Footwear,Lucy Conroy,Novato,
1,1950.0,domestic,Playing Cards,Lisa Vollrath,Euless,The Ofrenda Oracle Deck
2,22404.0,international,Rock,Electra,Hollywood,
3,165.384934,domestic,Playing Cards,Artur Ordijanc (deleted),Kaunas,
4,2820.0,domestic,Nonfiction,Dawn Johnston,Traverse City,
5,3725.0,domestic,Classical Music,Annapolis Chamber Players,Annapolis,
6,3890.0,domestic,Classical Music,The Tekalli Duo,New Haven,
7,660.0,international,Music,funktoast,Kaysville,
8,529.786729,international,Immersive,Overflow Theatre Company,Northampton,
9,2516.160602,international,Accessories,Lauren Ackerley,Wolverhampton,


Columns 31 to 37: 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'
* The column state_changed_at has values that do not look like time data.

In [None]:
# delete columns category, creator, location and profile
df.drop(['category','creator','location','profile'], axis=1, inplace=True)