In [275]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf 

%matplotlib inline

from datetime import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [276]:
location = "datasets/kickstartercombo.csv"

df = pd.read_csv(location)

In [277]:
df.shape

(702411, 16)

In [278]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,name,main_category,category,country,state,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real
0,0,1000002330,The Songs of Adelaide & Abullah,Publishing,Poetry,GB,failed,1000.0,0.0,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.0,,
1,1,1000004038,Where is Hank?,Film & Video,Narrative Film,US,failed,45000.0,220.0,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.0,,
2,2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,US,failed,5000.0,1.0,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.0,,
3,3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,US,canceled,19500.0,1283.0,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.0,,
4,4,1000014025,Monarch Espresso Bar,Food,Restaurants,US,successful,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,


In [279]:
#Find out what columns the data set has
df.columns

Index(['Unnamed: 0', 'ID', 'name', 'main_category', 'category', 'country',
       'state', 'goal', 'pledged', 'currency', 'launched', 'deadline',
       'backers', 'usd pledged', 'usd_goal_real', 'usd_pledged_real'],
      dtype='object')

In [280]:
#look at which categories are most frequent
df['main_category'].mode()

0    Film & Video
dtype: object

In [281]:
df['category'].mode()

0    Product Design
dtype: object

In [282]:
#lookk at the average total amount pledged based on categories (both main and sub-categories)
df.groupby(['main_category','category'])['pledged'].mean().sort_values()



main_category   category         
Graphic Novels   Religion               25.000000
Crafts          Crochet                317.257415
                Candles                568.980217
Photography     Places                 813.854630
Crafts          Printing               893.309905
Music           Hip-Hop               1150.490350
Crafts          DIY                   1163.979437
Publishing      Poetry                1258.871557
Food            Bacon                 1307.441127
Publishing      Young Adult           1338.152190
Crafts          Glass                 1406.594205
Journalism      Video                 1442.760952
Food            Events                1451.652849
Publishing      Fiction               1483.527741
Film & Video    Experimental          1545.650880
Photography     Nature                1599.260704
Art             Conceptual Art        1636.587644
Music           Punk                  1648.897370
Food            Community Gardens     1668.545306
Journalism      

In [283]:
#want to look at difference between pledged and goal amount for successful projects, add as column

#first isolate successful projects; create dummy columns, then remove row indices where "successful projects" is False
#create "delta pledged" column to give difference (overaged) between pledged and raised amount
df2 = pd.get_dummies(data=df, columns=['state'])
df3 = df2.drop(['state_canceled','state_failed','state_suspended','state_undefined','state_live'],axis=1)
df3['delta pledged']=df3['usd_pledged_real']-df3['usd_goal_real']
index_not_successful = df3[df3['state_successful']!=1].index
df3.drop(index_not_successful, inplace=True)
df3.head()


Unnamed: 0.1,Unnamed: 0,ID,name,main_category,category,country,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,state_successful,delta pledged
4,4,1000014025,Monarch Espresso Bar,Food,Restaurants,US,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,,1,
5,5,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,US,1000.0,1205.0,USD,2014-12-01 18:30:00,2014-12-21 18:30:00,16,1205.0,,,1,
10,10,100005484,Lisa Lim New CD!,Music,Indie Rock,US,12500.0,12700.0,USD,2013-03-09 06:42:00,2013-04-08 06:42:00,100,12700.0,,,1,
16,16,1000070642,Mike Corey's Darkness & Light Album,Music,Music,US,250.0,250.0,USD,2012-08-02 14:11:00,2012-08-17 14:11:00,7,250.0,,,1,
18,18,1000072011,CMUK. Shoes: Take on Life Feet First.,Fashion,Fashion,US,20000.0,34268.0,USD,2013-11-25 07:06:00,2013-12-30 07:06:00,624,34268.0,,,1,


In [284]:
df3.sort_values(by=['delta pledged'], ascending=False)

Unnamed: 0.1,Unnamed: 0,ID,name,main_category,category,country,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,state_successful,delta pledged
481020,481020,1799979574,"Pebble Time - Awesome Smartwatch, No Compromises",Design,Product Design,US,500000.0,20338986.27,USD,2015-02-24 15:44:42,2015-03-28,78471,2.033899e+07,500000.00,20338986.27,1,19838986.27
574004,574004,342886736,COOLEST COOLER: 21st Century Cooler that's Act...,Design,Product Design,US,50000.0,13285226.36,USD,2014-07-08 10:14:37,2014-08-30,62642,1.328523e+07,50000.00,13285226.36,1,13235226.36
613665,613665,545070200,Kingdom Death: Monster 1.5,Games,Tabletop Games,US,100000.0,12393139.69,USD,2016-11-25 06:01:41,2017-01-08,19264,5.228482e+06,100000.00,12393139.69,1,12293139.69
540379,540379,2103598555,"Pebble 2, Time 2 + All-New Pebble Core",Design,Product Design,US,1000000.0,12779843.49,USD,2016-05-24 15:49:52,2016-06-30,66673,1.277984e+07,1000000.00,12779843.49,1,11779843.49
606166,606166,506924864,Pebble: E-Paper Watch for iPhone and Android,Design,Product Design,US,100000.0,10266845.74,USD,2012-04-11 06:59:04,2012-05-19,68929,1.026685e+07,100000.00,10266845.74,1,10166845.74
617611,617611,565687737,The World's Best TRAVEL JACKET with 15 Feature...,Design,Product Design,US,20000.0,9192055.66,USD,2015-07-07 13:52:34,2015-09-03,44949,9.192056e+06,20000.00,9192055.66,1,9172055.66
511402,511402,1955357092,Exploding Kittens,Games,Tabletop Games,US,10000.0,8782571.99,USD,2015-01-20 19:00:19,2015-02-20,219382,8.782572e+06,10000.00,8782571.99,1,8772571.99
330415,330415,1033978702,OUYA: A New Kind of Video Game Console,Games,Gaming Hardware,US,950000.0,8596474.58,USD,2012-07-10 14:44:41,2012-08-09,63416,8.596475e+06,950000.00,8596474.58,1,7646474.58
633380,633380,647013276,"THE 7th CONTINENT – What Goes Up, Must Come Down.",Games,Tabletop Games,US,40000.0,7072757.00,USD,2017-09-26 20:00:02,2017-10-19,43733,1.574140e+06,40000.00,7072757.00,1,7032757.00
399650,399650,1386523707,Fidget Cube: A Vinyl Desk Toy,Design,Product Design,US,15000.0,6465690.30,USD,2016-08-30 22:02:09,2016-10-20,154926,1.377000e+04,15000.00,6465690.30,1,6450690.30


In [285]:
#want to see the relationship between the number of backers and whether the project is successful

#first, remove projects that are still live

df4 = df[df.state != 'live']

#now clean up by removing columns "Unnamed:0", "ID", "name"

df4 = df4.drop(['Unnamed: 0','ID','name'],axis=1)

#now look at the average number of backers for each project state

df4.groupby(['state'])['backers'].mean()

state
canceled       25.606675
failed         16.576774
successful    259.115265
suspended     108.983759
undefined       0.000000
Name: backers, dtype: float64

In [286]:
#look at correlation table, but have to assign numerical value to project states
#first isolate projects as either failed or successful. 
#For purposes of analysis, classify failed, canceled, suspended as "failed"

#first, remove "undefined". df4 had removed "live" entries, so additionally remove "undefined"
df5 = df4[df4.state != 'undefined']
#get dummy columns for other states
df5 = pd.get_dummies(data=df5, columns=['state'])

#classify anything other than "successful" as "unsuccessful", and add column with this conditional
df5['successful']=np.where((df5['state_successful'] != 1), 0, 1)
dfbinary = df5.drop(['state_failed','state_successful','state_suspended','state_canceled'],axis=1)
dfbinary.head()

Unnamed: 0,main_category,category,country,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,successful
0,Publishing,Poetry,GB,1000.0,0.0,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.0,,,0
1,Film & Video,Narrative Film,US,45000.0,220.0,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.0,,,0
2,Music,Music,US,5000.0,1.0,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.0,,,0
3,Film & Video,Film & Video,US,19500.0,1283.0,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.0,,,0
4,Food,Restaurants,US,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,,1


In [287]:
#now look at correlation table

dfbinary.corr()

Unnamed: 0,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,successful
goal,1.0,0.007129,0.004481,0.005963,0.941676,0.005145,-0.024798
pledged,0.007129,1.0,0.730699,0.897975,0.005072,0.953367,0.10936
backers,0.004481,0.730699,1.0,0.706974,0.004544,0.752478,0.124255
usd pledged,0.005963,0.897975,0.706974,1.0,0.006244,0.908014,0.099553
usd_goal_real,0.941676,0.005072,0.004544,0.006244,1.0,0.005643,-0.023564
usd_pledged_real,0.005145,0.953367,0.752478,0.908014,0.005643,1.0,0.110547
successful,-0.024798,0.10936,0.124255,0.099553,-0.023564,0.110547,1.0


In [288]:
#looks like successful projects are correlated more highly to USD pledged than with number of backers
#also some slight indication that successful projects are correlated to lower funding goals


In [289]:
# now want to add a column that indicates the time to reach goal (deadline - launched), in number of days


dfbinary['launched']= pd.to_datetime(dfbinary['launched'])
dfbinary['deadline']=pd.to_datetime(dfbinary['deadline'])

dfbinary['project duration in days']=(dfbinary['deadline']-dfbinary['launched']).dt.days
dfbinary.head()


Unnamed: 0,main_category,category,country,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,successful,project duration in days
0,Publishing,Poetry,GB,1000.0,0.0,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.0,,,0,58
1,Film & Video,Narrative Film,US,45000.0,220.0,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.0,,,0,45
2,Music,Music,US,5000.0,1.0,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.0,,,0,30
3,Film & Video,Film & Video,US,19500.0,1283.0,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.0,,,0,55
4,Food,Restaurants,US,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,,1,35


In [290]:
dfbinary.dtypes

main_category                       object
category                            object
country                             object
goal                               float64
pledged                            float64
currency                            object
launched                    datetime64[ns]
deadline                    datetime64[ns]
backers                              int64
usd pledged                        float64
usd_goal_real                      float64
usd_pledged_real                   float64
successful                           int64
project duration in days             int64
dtype: object

In [291]:
#also want to add column that calculates pledged amount as a percentage of goal, in its own currency

dfbinary['percent of goal pledged']=(dfbinary['pledged']/dfbinary['goal'])*100
dfbinary.round({'percent of goal pledged':2})

Unnamed: 0,main_category,category,country,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,successful,project duration in days,percent of goal pledged
0,Publishing,Poetry,GB,1000.0,0.00,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.000000,,,0,58,0.00
1,Film & Video,Narrative Film,US,45000.0,220.00,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.000000,,,0,45,0.49
2,Music,Music,US,5000.0,1.00,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.000000,,,0,30,0.02
3,Film & Video,Film & Video,US,19500.0,1283.00,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.000000,,,0,55,6.58
4,Food,Restaurants,US,50000.0,52375.00,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.000000,,,1,35,104.75
5,Food,Food,US,1000.0,1205.00,USD,2014-12-01 18:30:00,2014-12-21 18:30:00,16,1205.000000,,,1,20,120.50
6,Food,Drinks,US,25000.0,453.00,USD,2016-02-01 20:05:00,2016-03-17 19:05:00,40,453.000000,,,0,44,1.81
7,Design,Product Design,US,125000.0,8233.00,USD,2014-04-24 18:14:00,2014-05-29 18:14:00,58,8233.000000,,,0,35,6.59
8,Film & Video,Documentary,US,65000.0,6240.57,USD,2014-07-11 21:55:00,2014-08-10 21:55:00,43,6240.570000,,,0,30,9.60
9,Publishing,Nonfiction,CA,2500.0,0.00,CAD,2013-09-09 18:19:00,2013-10-09 18:19:00,0,0.000000,,,0,30,0.00


In [292]:
dfbinary.corr()

Unnamed: 0,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,successful,project duration in days,percent of goal pledged
goal,1.0,0.007129,0.004481,0.005963,0.941676,0.005145,-0.024798,0.004089,-0.000469
pledged,0.007129,1.0,0.730699,0.897975,0.005072,0.953367,0.10936,0.000627,0.008669
backers,0.004481,0.730699,1.0,0.706974,0.004544,0.752478,0.124255,-0.000919,0.016796
usd pledged,0.005963,0.897975,0.706974,1.0,0.006244,0.908014,0.099553,0.000667,0.006725
usd_goal_real,0.941676,0.005072,0.004544,0.006244,1.0,0.005643,-0.023564,0.004194,-0.000452
usd_pledged_real,0.005145,0.953367,0.752478,0.908014,0.005643,1.0,0.110547,0.000931,0.00899
successful,-0.024798,0.10936,0.124255,0.099553,-0.023564,0.110547,1.0,-0.025201,0.01448
project duration in days,0.004089,0.000627,-0.000919,0.000667,0.004194,0.000931,-0.025201,1.0,0.000183
percent of goal pledged,-0.000469,0.008669,0.016796,0.006725,-0.000452,0.00899,0.01448,0.000183,1.0


In [293]:
dfbinary.count()

main_category               688049
category                    688049
country                     688049
goal                        688049
pledged                     688049
currency                    688049
launched                    688049
deadline                    688049
backers                     688049
usd pledged                 687581
usd_goal_real               372300
usd_pledged_real            372300
successful                  688049
project duration in days    688049
percent of goal pledged     688049
dtype: int64

In [294]:
#indicates that success is correlated to # of backers more than amount pledged
#also that it is negatively correlated to project duration


In [295]:
#now want to see the average number of backers and the average project duration for each project type
dfbinarygrouped = dfbinary.groupby(['main_category','category'])['backers','project duration in days'].mean()
dfbinarygrouped.reset_index()
dfbinarygrouped.rename(columns={'backers':'mean backers','project duration in days':'mean days duration'}, inplace=True)
dfbinarygrouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean backers,mean days duration
main_category,category,Unnamed: 2_level_1,Unnamed: 3_level_1
Art,Art,44.021707,34.332335
Art,Ceramics,48.131931,30.753346
Art,Conceptual Art,27.353002,31.946687
Art,Digital Art,32.861179,31.724816
Art,Illustration,62.554634,30.107426


In [296]:
# now sort by number of backers and then by mean days duration
dfbinarygrouped.sort_values(by=['mean backers', 'mean days duration'],ascending=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,mean backers,mean days duration
main_category,category,Unnamed: 2_level_1,Unnamed: 3_level_1
Music,Chiptune,455.823529,33.161765
Games,Tabletop Games,421.439795,30.473188
Technology,Camera Equipment,403.133523,33.569602
Games,Video Games,387.619074,33.379496
Technology,Wearables,374.339450,35.026557
Technology,Sound,355.462087,33.983051
Technology,Space Exploration,327.226876,34.448517
Design,Typography,324.384615,31.912088
Technology,Hardware,318.381595,35.014226
Technology,Gadgets,312.412762,35.075972


In [297]:
#now want to look at the median number of backers
dfbinarygrouped['mean backers'].median()

48.709642548284805

In [298]:
#see if can get logistical regression model to predict success

#remove null values for USD_goal_real, usd pledged, and USD_pledged_real

dfmodel.isnull().sum()

goal                        0
pledged                     0
backers                     0
usd pledged                 0
usd_goal_real               0
usd_pledged_real            0
successful                  0
project duration in days    0
percent of goal pledged     0
dtype: int64

In [299]:
dfbinary1 = dfbinary.dropna()
dfbinary1.isnull().sum()

main_category               0
category                    0
country                     0
goal                        0
pledged                     0
currency                    0
launched                    0
deadline                    0
backers                     0
usd pledged                 0
usd_goal_real               0
usd_pledged_real            0
successful                  0
project duration in days    0
percent of goal pledged     0
dtype: int64

In [307]:
#isolate variables that I think are most highly correlated: 

dfmodel = dfbinary1.drop(['launched','deadline','currency','main_category','category','country'],axis=1)
dfmodel.dtypes

goal                        float64
pledged                     float64
backers                       int64
usd pledged                 float64
usd_goal_real               float64
usd_pledged_real            float64
successful                    int64
project duration in days      int64
percent of goal pledged     float64
dtype: object

In [301]:
#create logistical regression model to predict success

y = dfmodel['successful']
x = dfmodel.drop(['successful'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=15)
LogReg = LogisticRegression()
LogReg.fit(x_train, y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [302]:
LogReg.score(x_train, y_train)

0.9924341176944889

In [303]:
y_pred = LogReg.predict(x_test)

In [305]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     47493
           1       0.98      1.00      0.99     26921

   micro avg       0.99      0.99      0.99     74414
   macro avg       0.99      0.99      0.99     74414
weighted avg       0.99      0.99      0.99     74414



In [None]:
#My models indicate that focusing on number of backers, project duration, and project goal correlates most highly to prediciton of success