# Kickstarter Preprocessing

## Preprocess the data. Balance the dataset. Create 3 datasets: training, validation, and test. Save the newly created sets in a tensor-friendly format (e.g. `.npz`).

In [100]:
#Import libraries 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

# We will use the sklearn preprocessing library, as it will be easier to standardize the data.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Load and inspect the data

In [122]:
# Load the cleaned Kickstarter projects file into a DataFrame and preview its first rows.
kickstarter = pd.read_csv(filepath_or_buffer='ks-projects-201801-clean.csv')
kickstarter.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_year,period,funded_ratio
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 00:00:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015,58 days 11:47:32.000000000,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01 00:00:00,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017,59 days 19:16:03.000000000,0.0807
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:00:00,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013,44 days 23:39:10.000000000,0.004889
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 00:00:00,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012,29 days 20:35:49.000000000,0.0002
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 00:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015,55 days 15:24:57.000000000,0.065795


In [123]:
# Show the dtype pandas assigned to each column of the freshly loaded frame.
kickstarter.dtypes

ID                    int64
name                 object
category             object
main_category        object
currency             object
deadline             object
goal                float64
launched             object
pledged             float64
state                object
backers               int64
country              object
usd pledged         float64
usd_pledged_real    float64
usd_goal_real       float64
launched_year         int64
period               object
funded_ratio        float64
dtype: object

In [124]:
# Drop the columns that are not used in the rest of the preprocessing.
unused_columns = [
    'category',
    'ID',
    'name',
    'deadline',
    'launched',
    'launched_year',
    'currency',
    'goal',
    'pledged',
    'usd pledged',
]
kickstarter = kickstarter.drop(columns=unused_columns)
kickstarter.head(n=5)

Unnamed: 0,main_category,state,backers,country,usd_pledged_real,usd_goal_real,period,funded_ratio
0,Publishing,failed,0,GB,0.0,1533.95,58 days 11:47:32.000000000,0.0
1,Film & Video,failed,15,US,2421.0,30000.0,59 days 19:16:03.000000000,0.0807
2,Film & Video,failed,3,US,220.0,45000.0,44 days 23:39:10.000000000,0.004889
3,Music,failed,1,US,1.0,5000.0,29 days 20:35:49.000000000,0.0002
4,Film & Video,canceled,14,US,1283.0,19500.0,55 days 15:24:57.000000000,0.065795


In [125]:
# Encode the `state` column numerically: 'successful' becomes 1,
# while 'failed' and 'canceled' both become 0.
cleanup_state = {"state": dict(failed=0, canceled=0, successful=1)}
kickstarter = kickstarter.replace(to_replace=cleanup_state)
kickstarter.head(n=5)

Unnamed: 0,main_category,state,backers,country,usd_pledged_real,usd_goal_real,period,funded_ratio
0,Publishing,0,0,GB,0.0,1533.95,58 days 11:47:32.000000000,0.0
1,Film & Video,0,15,US,2421.0,30000.0,59 days 19:16:03.000000000,0.0807
2,Film & Video,0,3,US,220.0,45000.0,44 days 23:39:10.000000000,0.004889
3,Music,0,1,US,1.0,5000.0,29 days 20:35:49.000000000,0.0002
4,Film & Video,0,14,US,1283.0,19500.0,55 days 15:24:57.000000000,0.065795


In [126]:
# Count the number of ones and zeros in `state`, as absolute counts and as percentages.
state_counts = kickstarter['state'].value_counts()
state_share = kickstarter['state'].value_counts(normalize=True)
state_percent = (state_share * 100).round(1).astype(str) + '%'
pd.DataFrame({'counts': state_counts, 'percent': state_percent})

Unnamed: 0,counts,percent
0,233247,63.8%
1,132266,36.2%


In [127]:
# Keep only the whole-day count of each campaign duration.
# `period` holds timedelta-formatted strings such as '58 days 11:47:32.000000000';
# the token before the first whitespace is the day count. Splitting on whitespace
# (instead of slicing the first two characters) keeps durations of 100+ days and
# negative durations intact rather than silently truncating them.
# The column stays a string column here; the integer cast happens separately.
kickstarter['period'] = kickstarter['period'].str.split(n=1).str[0]

In [128]:
# Pin every remaining column to an explicit dtype in a single pass.
column_dtypes = {
    'main_category': str,
    'state': int,
    'backers': int,
    'country': str,
    'usd_pledged_real': float,
    'usd_goal_real': float,
    'period': int,
    'funded_ratio': float,
}
kickstarter = kickstarter.astype(column_dtypes)

In [140]:
# Show the column dtypes of the current frame.
# NOTE(review): the recorded output for this cell already lists the one-hot
# `country_*` / `main_category_*` columns, which are only created by the
# get_dummies cells further down - the cell was executed out of order.
# Re-run the notebook top to bottom to refresh this output.
kickstarter.dtypes

state                           int32
backers                         int32
usd_pledged_real              float64
usd_goal_real                 float64
period                          int32
funded_ratio                  float64
country_GB                      uint8
country_Other                   uint8
country_US                      uint8
main_category_Art               uint8
main_category_Comics            uint8
main_category_Crafts            uint8
main_category_Dance             uint8
main_category_Design            uint8
main_category_Fashion           uint8
main_category_Film & Video      uint8
main_category_Food              uint8
main_category_Games             uint8
main_category_Journalism        uint8
main_category_Music             uint8
main_category_Photography       uint8
main_category_Publishing        uint8
main_category_Technology        uint8
main_category_Theater           uint8
dtype: object

In [133]:
# Fold the listed country codes into a single 'Other' bucket; codes that are
# not listed (e.g. 'US' and 'GB') are left untouched.
# One exact-match `Series.replace` call replaces twenty separate `.str.replace`
# calls, which did substring (and, in older pandas, regex) matching and scanned
# the whole column once per code.
minor_countries = [
    'CA', 'AU', 'DE', 'FR', 'NL', 'IT', 'ES', 'SE', 'MX', 'NZ',
    'DK', 'IE', 'CH', 'NO', 'BE', 'AT', 'HK', 'SG', 'LU', 'JP',
]
kickstarter['country'] = kickstarter['country'].replace(minor_countries, 'Other')

In [134]:
# How many rows fall into each remaining `country` value, as counts and as percentages?
country_counts = kickstarter['country'].value_counts()
country_share = kickstarter['country'].value_counts(normalize=True)
country_percent = (country_share * 100).round(1).astype(str) + '%'
pd.DataFrame({'counts': country_counts, 'percent': country_percent})

Unnamed: 0,counts,percent
US,284972,78.0%
Other,47328,12.9%
GB,33213,9.1%


In [137]:
# Replace the `country` column with one indicator column per distinct value.
kickstarter = pd.get_dummies(data=kickstarter, columns=['country'])

In [139]:
# Replace the `main_category` column with one indicator column per distinct value.
kickstarter = pd.get_dummies(data=kickstarter, columns=['main_category'])

In [144]:
# Preview the first rows of the fully encoded frame.
kickstarter.head(n=5)

Unnamed: 0,state,backers,usd_pledged_real,usd_goal_real,period,funded_ratio,country_GB,country_Other,country_US,main_category_Art,...,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,main_category_Journalism,main_category_Music,main_category_Photography,main_category_Publishing,main_category_Technology,main_category_Theater
0,0,0,0.0,1533.95,58,0.0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,15,2421.0,30000.0,59,0.0807,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,0,3,220.0,45000.0,44,0.004889,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,1.0,5000.0,29,0.0002,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,14,1283.0,19500.0,55,0.065795,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [153]:
# Confirm that every column is numeric before the frame is converted to a NumPy array.
kickstarter.dtypes

state                           int32
backers                         int32
usd_pledged_real              float64
usd_goal_real                 float64
period                          int32
funded_ratio                  float64
country_GB                      uint8
country_Other                   uint8
country_US                      uint8
main_category_Art               uint8
main_category_Comics            uint8
main_category_Crafts            uint8
main_category_Dance             uint8
main_category_Design            uint8
main_category_Fashion           uint8
main_category_Film & Video      uint8
main_category_Food              uint8
main_category_Games             uint8
main_category_Journalism        uint8
main_category_Music             uint8
main_category_Photography       uint8
main_category_Publishing        uint8
main_category_Technology        uint8
main_category_Theater           uint8
dtype: object

In [170]:
# Convert the pandas DataFrame into a float NumPy array (column labels are not carried over).
numpyMatrix = kickstarter.to_numpy(dtype=float)

In [174]:
# Save the numeric matrix as comma-separated text (no header row is written).
np.savetxt(fname='ks-projects-201801-one-hot.csv', X=numpyMatrix, delimiter=',')