# Initial data cleaning

In [1]:
import numpy as np
import pandas as pd
import json

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Loading dataset and dropping the first column, because it contains rownumbers
df = pd.read_csv('data/Kickstarter_merged.csv')
df = df.drop(columns='Unnamed: 0', axis=1) 

In [3]:
# Print the column names
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')

In [4]:
# look at the first 10 rows of the columns 0 to 10 to find odd values
#df.iloc[0:10, 0:11]

Columns 1 to 10: 'backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency'
* There are too many values in the columns category and creator.
* The column created_at has values that do not look like time data.

In [5]:
df.category[0]

'{"id":266,"name":"Footwear","slug":"fashion/footwear","position":5,"parent_id":9,"color":16752598,"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/fashion/footwear"}}}'

In [6]:
# Get name out of JSON formatted dictionary
import json
n = json.loads(df.category[0])
print(n.get("name"))

Footwear


In [None]:
# Extract the category name and make a new column with the category name
df["category_name"] = ""
for i in range(len(df.category)):
    try:
        dict_cat = json.loads(df.category[i])
        df.category_name[i] = dict_cat.get("name")
    except:
        df.category_name[i] = NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.category_name[i] = dict_cat.get("name")


In [None]:
df.creator[0]

In [None]:
# Get name out of json formatted library
import json
m = json.loads(df.creator[0])
print(m.get("name"))

In [None]:
# Extract the creator name and make a new column with the creator name
df["creator_name"] = ""
for j in range(len(df.creator)):
    try:
        dict_cre = json.loads(df.creator[j])
        df.creator_name[j] = dict_cre.get("name")
    except:
        df.creator_name[j] = 'NaN'

In [None]:
# look at the first 10 rows of the columns 11 to 20 to find odd values
df.iloc[0:10, 11:21]

Columns 11 to 20: 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at'
* There are many NaN in the columns friends, is_backing and is_starred.
* The columns deadline and launched_at have values that do not look like time data.

In [None]:
# look at the first 10 rows of the columns 21 to 30 to find odd values
df.iloc[0:10, 21:31]

Columns 21 to 30: 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug','source_url', 'spotlight', 'staff_pick'
* There are too many values in the columns location and profile.
* The column photo is a dictionary.
* There are many NaN in the column permissions.

In [None]:
df.location[0]

In [None]:
# Extract the location name and make a new column with the location name
df["location_name"] = ""
for k in range(len(df.location)):
    try:
        dict_loc = json.loads(df.location[k])
        df.location_name[k] = dict_loc.get("name")
    except:
        df.location_name[k] = 'NaN'

In [None]:
df.profile[0]

In [None]:
# Extract the profile name and make a new column with the profile name
df["profile_name"] = ""
for l in range(len(df.profile)):
    try:
        dict_cre = json.loads(df.profile[l])
        df.profile_name[l] = dict_cre.get("name")
    except:
        df.profile_name[l] = 'NaN'

In [None]:
# look at the first 10 rows of the columns 31 to 36 to find odd values
df.iloc[0:10, 31:37]

Columns 31 to 37: 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'
* The column state_changed_at has values that do not look like time data.

In [None]:
# delete columns category, creator, location and profile
df.drop(['category','creator','location','profile'], axis=1, inplace=True)

In [None]:
df.shape