In [1]:
import numpy as np
import pandas as pd
import time

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the pre-cleaned Kickstarter export. The first column ('Unnamed: 0')
# only holds row numbers, so it is removed right after loading.
df_init = pd.read_csv('data/Kickstarter_init_cleaned.csv')
df_init2 = df_init.drop('Unnamed: 0', axis=1)

In [3]:
# Render floats with a thousands separator and two decimal places
pd.set_option('display.float_format', '{:,.2f}'.format)

### Selecting data

In [4]:
# Number of projects per value of the state column
df_init2["state"].value_counts()

successful    117465
failed         75199
canceled        8624
live            7311
suspended        623
Name: state, dtype: int64

In [5]:
# Drop the states canceled, suspended and live, so that only successful and
# failed projects remain.
excluded_states = ["canceled", "suspended", "live"]
df_st = df_init2[~df_init2["state"].isin(excluded_states)]

In [6]:
# Number of projects per state after the filter
df_st["state"].value_counts()

successful    117465
failed         75199
Name: state, dtype: int64

In [7]:
# Checking column names: list every column still present in df_st before
# deciding which ones to drop
df_st.columns

Index(['backers_count', 'blurb', 'converted_pledged_amount', 'country',
       'created_at', 'currency', 'currency_trailing_code', 'current_currency',
       'deadline', 'disable_communication', 'fx_rate', 'goal', 'id',
       'is_starrable', 'launched_at', 'name', 'pledged', 'slug', 'spotlight',
       'staff_pick', 'state', 'state_changed_at', 'static_usd_rate',
       'usd_pledged', 'usd_type', 'category_name', 'creator_name',
       'location_name', 'location_state', 'created_at_rd', 'deadline_rd',
       'launched_at_rd', 'state_changed_at_rd'],
      dtype='object')

In [8]:
# Columns that are not used in the rest of this notebook
unused_columns = [
    'currency_trailing_code',
    'current_currency',
    'disable_communication',
    'is_starrable',
    'spotlight',
    'staff_pick',
    'static_usd_rate',
    'usd_type',
]
df_sel = df_st.drop(unused_columns, axis=1)

### Removing duplicates based on id

In [9]:
# Number of surplus rows: total rows minus the number of distinct ids
n_surplus_rows = len(df_sel) - df_sel["id"].nunique()
print(n_surplus_rows)
# Every row whose id occurs more than once (all occurrences are kept)
double = df_sel.loc[df_sel["id"].duplicated(keep=False)]
#double.sort_values("id")

23685


In [10]:
# Keep one row per id; when an id occurs several times the last occurrence wins
df_id = df_sel.drop_duplicates(subset="id", keep="last")
#df_id[df_id.id==39036]
df_id.shape

(168979, 25)

### Removing columns with comparable content regarding the amount pledged

In [11]:
# Check the difference between converted_pledged_amount, pledged and usd_pledged.
# The inspection code below is disabled; uncomment it to compare the three
# columns side by side.
#pledged = df_id[["converted_pledged_amount","pledged","usd_pledged"]]
#pledged.describe()
#pledged.head(20)

`converted_pledged_amount` is rounded, `usd_pledged` is not. `pledged` is in the original currency. We only keep `usd_pledged`.

In [12]:
# Of the three pledged-amount columns only usd_pledged is kept
df_mis = df_id.drop(["converted_pledged_amount", "pledged"], axis=1)

### Check for missing data.

In [13]:
#df_mis.info()

In [14]:
# Rows that have a missing value in at least one column
has_missing = df_mis.isnull().any(axis="columns")
df1 = df_mis.loc[has_missing]
df1.shape

(745, 23)

There are 745 rows with at least one missing value. The affected columns are `blurb`, `creator_name`, `location_name`, and `location_state`.

In [15]:
# Show the rows in which the blurb is missing
df_mis.loc[df_mis["blurb"].isna()]

Unnamed: 0,backers_count,blurb,country,created_at,currency,deadline,fx_rate,goal,id,launched_at,...,state_changed_at,usd_pledged,category_name,creator_name,location_name,location_state,created_at_rd,deadline_rd,launched_at_rd,state_changed_at_rd
65168,39,,DE,1504364375,EUR,1507625188,1.13,15000.0,937524480,1505033188,...,1507625190,8873.67,Ready-to-wear,Annabelle Deisler,Munich,Bavaria,Sat Sep 2 16:59:35 2017,Tue Oct 10 10:46:28 2017,Sun Sep 10 10:46:28 2017,Tue Oct 10 10:46:30 2017
108662,0,,US,1509679461,USD,1515800048,1.0,40000.0,1077399482,1510616048,...,1515800048,0.0,Digital Art,moe,Los Angeles,CA,Fri Nov 3 04:24:21 2017,Sat Jan 13 00:34:08 2018,Tue Nov 14 00:34:08 2017,Sat Jan 13 00:34:08 2018


In [16]:
# Where the blurb is missing, use the project name of the same row instead.
blurb_missing = df_mis["blurb"].isna()
df_mis.loc[blurb_missing, "blurb"] = df_mis.loc[blurb_missing, "name"]
#df_mis.blurb[65168]

What do we do about creator name, location name and location state?

In [17]:
# Fill the remaining gaps with substitute values:
#   creator_name   -> the placeholder "John Doe"
#   location_state -> the country of the same row
#   location_name  -> the (already filled) location_state of the same row
# The order matters: location_state must be filled before location_name.
df_mis["creator_name"] = df_mis["creator_name"].fillna("John Doe")
df_mis["location_state"] = df_mis["location_state"].fillna(df_mis["country"])
df_mis["location_name"] = df_mis["location_name"].fillna(df_mis["location_state"])

### Feature engineering

In [18]:
# Goal in USD: the goal multiplied by the fx_rate column.
# NOTE(review): static_usd_rate was dropped earlier in this notebook; if
# usd_pledged was converted with that rate rather than fx_rate, usd_goal and
# usd_pledged are not strictly comparable — confirm.
df_mis["usd_goal"] = df_mis["goal"].mul(df_mis["fx_rate"])
# The original-currency goal, the currency code and the exchange rate are
# no longer needed once usd_goal exists.
df_feat1 = df_mis.drop(["goal", "currency", "fx_rate"], axis=1)

In [19]:
# Project duration: (deadline - launched_at) divided by the number of
# seconds in a day, rounded to whole days.
seconds_per_day = 60 * 60 * 24
launch_to_deadline = df_feat1["deadline"] - df_feat1["launched_at"]
df_feat1["duration_days"] = (launch_to_deadline / seconds_per_day).round()
# Drop the raw timestamp columns.
timestamp_columns = ["created_at", "deadline", "launched_at", "state_changed_at"]
df_feat2 = df_feat1.drop(timestamp_columns, axis=1)

In [20]:
# Amount pledged minus the goal, both in USD (negative when the goal was missed)
df_feat2["d_pledged_goal_usd"] = df_feat2["usd_pledged"].sub(df_feat2["usd_goal"])
#df_feat2.head()

### Dealing with outliers of numeric features

In [21]:
# Keep projects whose usd_goal lies between 100 USD and the highest goal of
# any successful project (both bounds inclusive).
successful_goals = df_feat2.loc[df_feat2["state"] == "successful", "usd_goal"]
highest_goal = successful_goals.max()
goal_in_range = df_feat2["usd_goal"].between(100.0, highest_goal)
df = df_feat2.loc[goal_in_range]
df.usd_goal.describe()

count     166,192.00
mean       19,490.07
std        78,516.05
min           100.00
25%         1,502.45
50%         5,000.00
75%        13,256.80
max     2,000,000.00
Name: usd_goal, dtype: float64

In [22]:
# Number of rows removed by the usd_goal filter
len(df_feat2) - len(df)

2787

In [23]:
# Check for more outliers: summary statistics of df's numeric columns
df.describe()

Unnamed: 0,backers_count,id,usd_pledged,usd_goal,duration_days,d_pledged_goal_usd
count,166192.0,166192.0,166192.0,166192.0,166192.0,166192.0
mean,141.99,1073397001.68,12297.14,19490.07,32.72,-7192.93
std,901.95,619431950.59,84403.0,78516.05,11.69,106086.46
min,0.0,8624.0,0.0,100.0,1.0,-2000000.0
25%,3.0,535785761.75,101.69,1502.45,30.0,-4985.0
50%,26.0,1074926261.0,1524.22,5000.0,30.0,39.55
75%,86.0,1608503247.5,6372.99,13256.8,34.0,783.21
max,105857.0,2147476221.0,8596474.58,2000000.0,93.0,7646474.58


In [24]:
# Rows whose pledged-minus-goal difference equals -2,000,000 USD
check1 = df.loc[df["d_pledged_goal_usd"] == -2000000]
#check1

In [25]:
# Rows for projects with a duration of one day
check2 = df.loc[df["duration_days"] == 1]
#check2

In [26]:
# Write the cleaned dataset to disk. The row index is written too (pandas'
# default), so the file starts with an unnamed index column.
df.to_csv(path_or_buf='data/Kickstarter_cleaned.csv', index=True)

### Standardization

Standardization should take place later, at the `fit_transform` step of modelling, not in this notebook. Features to standardize:
* backers_count
* usd_pledged
* usd_goal
* duration_days
* d_pledged_goal_usd

In [27]:
## Potential candidate: power transformer (disabled sketch, not executed here).
#from sklearn.preprocessing import PowerTransformer
## Yeo-Johnson handles positive and negative values: 'd_pledged_goal_usd'
#PowerTransformer(method='yeo-johnson').fit_transform(X)
## Box-Cox requires strictly positive values. In this dataset backers_count and
## usd_pledged have a minimum of 0, so Box-Cox would only work directly for
## usd_goal and duration_days; the zero-containing features would need
## Yeo-Johnson (or a shift) instead.
#PowerTransformer(method='box-cox').fit_transform(X)