# Pre Processing and Training Data: Kickstarter

For the next stage of my project, I need to normalize and standardize all of the features of my data, as well as create a validation set.

In [168]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split

In [169]:
ks_clean = pd.read_csv('ks_clean2.csv')

In [170]:
ks_clean.head()

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,ym-date
0,117051,1703704063,drawing for dollars,Illustration,Art,USD,2009-05-03 08:59:59,20.0,2009-04-24 21:52:03,35.0,successful,3,US,35.0,2009-04-01 00:00:00
1,27852,1167151653,Smogr Alert Field Recording,Graphic Design,Design,USD,2009-07-04 18:40:00,640.0,2009-04-29 22:08:13,41.0,failed,3,US,41.0,2009-04-01 00:00:00
2,143091,1860890148,Grace Jones Does Not Give A F$#% T-Shirt (limi...,Fashion,Fashion,USD,2009-05-31 08:59:59,1000.0,2009-04-21 21:02:48,625.0,failed,30,US,625.0,2009-04-01 00:00:00
3,288861,813230527,Sponsor Dereck Blackburn (Lostwars) Artist in ...,Rock,Music,USD,2009-05-16 01:10:00,300.0,2009-04-29 05:26:32,15.0,failed,2,US,15.0,2009-04-01 00:00:00
4,232330,469734648,kicey to iceland,Photography,Photography,USD,2009-06-17 06:00:00,350.0,2009-04-29 06:43:44,1630.0,successful,31,US,1630.0,2009-04-01 00:00:00


In [171]:
ks_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331672 entries, 0 to 331671
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     331672 non-null  int64  
 1   ID             331672 non-null  int64  
 2   name           331672 non-null  object 
 3   category       331672 non-null  object 
 4   main_category  331672 non-null  object 
 5   currency       331672 non-null  object 
 6   deadline       331672 non-null  object 
 7   goal           331672 non-null  float64
 8   launched       331672 non-null  object 
 9   pledged        331672 non-null  float64
 10  state          331672 non-null  object 
 11  backers        331672 non-null  int64  
 12  country        331672 non-null  object 
 13  usd_pledged    331672 non-null  float64
 14  ym-date        331672 non-null  object 
dtypes: float64(3), int64(3), object(9)
memory usage: 38.0+ MB


In this step, I set up dummy variables for all my integer variables. and then concatenated them back into the main dataframe.

In [172]:
dfo=ks_clean.select_dtypes(include=['object']) # select object type columns

In [173]:
dummy2 = pd.get_dummies(dfo['main_category'])

In [174]:
dummy3 = pd.get_dummies(dfo['currency'])

In [175]:
dummy4 = pd.get_dummies(dfo['state'])

In [176]:
dummy5 = pd.get_dummies(dfo['country'])

In [177]:
ks_clean = pd.concat([ks_clean, dummy2], axis=1)

In [178]:
ks_clean = pd.concat([ks_clean, dummy3], axis=1)

In [179]:
ks_clean = pd.concat([ks_clean, dummy4], axis=1)

In [180]:
ks_clean = pd.concat([ks_clean, dummy5], axis=1)

In [181]:
ks_clean['main_category'].nunique()

15

In [182]:
ks_clean.head()

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,...,JP,LU,MX,"N,""0",NL,NO,NZ,SE,SG,US
0,117051,1703704063,drawing for dollars,Illustration,Art,USD,2009-05-03 08:59:59,20.0,2009-04-24 21:52:03,35.0,...,0,0,0,0,0,0,0,0,0,1
1,27852,1167151653,Smogr Alert Field Recording,Graphic Design,Design,USD,2009-07-04 18:40:00,640.0,2009-04-29 22:08:13,41.0,...,0,0,0,0,0,0,0,0,0,1
2,143091,1860890148,Grace Jones Does Not Give A F$#% T-Shirt (limi...,Fashion,Fashion,USD,2009-05-31 08:59:59,1000.0,2009-04-21 21:02:48,625.0,...,0,0,0,0,0,0,0,0,0,1
3,288861,813230527,Sponsor Dereck Blackburn (Lostwars) Artist in ...,Rock,Music,USD,2009-05-16 01:10:00,300.0,2009-04-29 05:26:32,15.0,...,0,0,0,0,0,0,0,0,0,1
4,232330,469734648,kicey to iceland,Photography,Photography,USD,2009-06-17 06:00:00,350.0,2009-04-29 06:43:44,1630.0,...,0,0,0,0,0,0,0,0,0,1


Here I standardized the magnitude of the features using a scaler.

In [183]:
scaler = preprocessing.StandardScaler() 

In [184]:
ks_num = ks_clean.select_dtypes(include=['int64','float64','uint8'])

In [185]:
ks_num.head()

Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd_pledged,Art,Comics,Crafts,Dance,...,JP,LU,MX,"N,""0",NL,NO,NZ,SE,SG,US
0,117051,1703704063,20.0,35.0,3,35.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,27852,1167151653,640.0,41.0,3,41.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,143091,1860890148,1000.0,625.0,30,625.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,288861,813230527,300.0,15.0,2,15.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,232330,469734648,350.0,1630.0,31,1630.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


There were a few unnecessary rows and columns left over from pervious steps, so I cleaned them up here. 

In [186]:
ks_num.drop('Unnamed: 0', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [187]:
ks_num.drop('ID', axis=1, inplace=True)

In [188]:
ks_num.drop('N,"0',axis=1,inplace=True)

In [189]:
print(ks_num.columns.tolist())

['goal', 'pledged', 'backers', 'usd_pledged', 'Art', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion', 'Film & Video', 'Food', 'Games', 'Journalism', 'Music', 'Photography', 'Publishing', 'Technology', 'Theater', 'AUD', 'CAD', 'CHF', 'DKK', 'EUR', 'GBP', 'HKD', 'JPY', 'MXN', 'NOK', 'NZD', 'SEK', 'SGD', 'USD', 'failed', 'successful', 'AT', 'AU', 'BE', 'CA', 'CH', 'DE', 'DK', 'ES', 'FR', 'GB', 'HK', 'IE', 'IT', 'JP', 'LU', 'MX', 'NL', 'NO', 'NZ', 'SE', 'SG', 'US']


Here I isolated the numeric values for the scaler into a seperate dataframe, scaled_ks.

In [190]:
scaled_ks = scaler.fit_transform(ks_num) 

In [191]:
scaled_ks = pd.DataFrame(scaled_ks, columns=['goal', 'pledged', 'backers', 'usd_pledged', 'Art', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion', 'Film & Video', 'Food', 'Games', 'Journalism', 'Music', 'Photography', 'Publishing', 'Technology', 'Theater', 'AUD', 'CAD', 'CHF', 'DKK', 'EUR', 'GBP', 'HKD', 'JPY', 'MXN', 'NOK', 'NZD', 'SEK', 'SGD', 'USD', 'failed', 'successful', 'AT', 'AU', 'BE', 'CA', 'CH', 'DE', 'DK', 'ES', 'FR', 'GB', 'HK', 'IE', 'IT', 'JP', 'LU', 'MX', 'NL', 'NO', 'NZ', 'SE', 'SG', 'US'])

In [192]:
scaled_ks.head()

Unnamed: 0,goal,pledged,backers,usd_pledged,Art,Comics,Crafts,Dance,Design,Fashion,...,IT,JP,LU,MX,NL,NO,NZ,SE,SG,US
0,-0.039566,-0.103838,-0.117443,-0.091902,3.454812,-0.175205,-0.155372,-0.104355,-0.28776,-0.251798,...,-0.084817,-0.008328,-0.013111,-0.065363,-0.085571,-0.041926,-0.062096,-0.067605,-0.037023,0.518684
1,-0.039011,-0.103779,-0.117443,-0.09183,-0.289451,-0.175205,-0.155372,-0.104355,3.475124,-0.251798,...,-0.084817,-0.008328,-0.013111,-0.065363,-0.085571,-0.041926,-0.062096,-0.067605,-0.037023,0.518684
2,-0.038689,-0.09803,-0.089476,-0.08481,-0.289451,-0.175205,-0.155372,-0.104355,-0.28776,3.971434,...,-0.084817,-0.008328,-0.013111,-0.065363,-0.085571,-0.041926,-0.062096,-0.067605,-0.037023,0.518684
3,-0.039315,-0.104035,-0.118478,-0.092143,-0.289451,-0.175205,-0.155372,-0.104355,-0.28776,-0.251798,...,-0.084817,-0.008328,-0.013111,-0.065363,-0.085571,-0.041926,-0.062096,-0.067605,-0.037023,0.518684
4,-0.03927,-0.088138,-0.08844,-0.072728,-0.289451,-0.175205,-0.155372,-0.104355,-0.28776,-0.251798,...,-0.084817,-0.008328,-0.013111,-0.065363,-0.085571,-0.041926,-0.062096,-0.067605,-0.037023,0.518684


With the data scaled, I finished by splitting up the training and the testing data.

In [193]:
training_data, testing_data = train_test_split(scaled_ks, test_size=0.2, random_state=25)

In [194]:
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 265337
No. of testing examples: 66335
