In [2]:
import pandas as pd
import numpy as np
import zipfile

In [3]:
dataset_file = ("https://github.com/DSPT8-Kickstarter/main/blob/main/data/Kickstarter_Projects.csv.zip")

In [4]:
df = pd.read_csv('/content/ks-projects-201801.csv.zip')

In [5]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [6]:
df.shape

(378661, 15)

In [7]:
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [8]:
# removed all values in the other than binary.
df2 = df[df['state'].isin(['failed', 'successful'])]
df2['state'].value_counts()

failed        197719
successful    133956
Name: state, dtype: int64

In [9]:
df2.shape

# This shows that we have knocked the dataset size down from 378661 to 331675

(331675, 15)

# With the dataset reduced, we can get started on a model.

In [51]:
df2['success'].value_counts(normalize=True)

0    0.596123
1    0.403877
Name: success, dtype: float64

In [10]:
df2['success'] = np.where(df2['state'] == "successful", 1, 0)
print(df2['success'].value_counts())
df2.head()

0    197719
1    133956
Name: success, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,1


In [11]:
df2['main_category'].value_counts()

Film & Video    56527
Music           45949
Publishing      35445
Games           28521
Technology      27050
Art             25641
Design          25364
Food            22054
Fashion         19775
Theater         10242
Comics           9878
Photography      9689
Crafts           7818
Journalism       4149
Dance            3573
Name: main_category, dtype: int64

In [12]:
df2['main_category'].nunique()

15

In [13]:
# at 15 main categories, one-hot encoding looks to be appropriate.

In [14]:
df2['category'].value_counts()

Product Design     18680
Documentary        14523
Music              12633
Tabletop Games     11744
Shorts             11394
                   ...  
Residencies           68
Letterpress           46
Chiptune              33
Literary Spaces       19
Taxidermy             10
Name: category, Length: 159, dtype: int64

In [15]:
df2['category'].nunique()

# There are 159 different categories - too much to one-hot encode!  

# "Learned embedding" might be a more appropriate technique:
# https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/

159

In [16]:
# getting the objects for review

obj_df = df2.select_dtypes(include=['object']).copy()
obj_df.head()

#from here, let's take out the months.

Unnamed: 0,name,category,main_category,currency,deadline,launched,state,country
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,US
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,US
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,failed,US
5,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,2016-02-26 13:38:27,successful,US


In [17]:
df2['launched'] = pd.to_datetime(df2['launched'])
df2['deadline'] = pd.to_datetime(df2['deadline'])
df2['launched'].dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


dtype('<M8[ns]')

In [18]:
df2['days'] = (df2['deadline']-df2['launched']).astype('timedelta64[D]')
df2['days'] = df2['days'].astype(int)
df2['days'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0    58
1    59
2    44
3    29
5    34
Name: days, dtype: int64

In [19]:
df2['days'].value_counts()

29    148478
59     27969
44     15241
30     11724
34      8619
       ...  
81        46
76        45
73        43
78        35
91        22
Name: days, Length: 92, dtype: int64

In [20]:
print(df2['deadline'][0])
print(df2['launched'][0])

2015-10-09 00:00:00
2015-08-11 12:12:28


In [21]:
df2['month_launched'] = pd.DatetimeIndex(df2['launched']).month
df2['month_launched'].head()

unencoded_df = df2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df2.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success,days,month_launched
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,58,8
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,59,9
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,44,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,29,3
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,1,34,2


In [23]:
!pip install category_encoders



In [24]:
import category_encoders as ce

cols = ['main_category', 'currency']
encoder = ce.OneHotEncoder(cols=cols, use_cat_names=True, )
encoded_df = encoder.fit_transform(df2)

  import pandas.util.testing as tm
  elif pd.api.types.is_categorical(cols):


In [25]:
encoded_df.head()

Unnamed: 0,ID,name,category,main_category_Publishing,main_category_Film & Video,main_category_Music,main_category_Food,main_category_Crafts,main_category_Games,main_category_Design,main_category_Comics,main_category_Fashion,main_category_Theater,main_category_Art,main_category_Photography,main_category_Technology,main_category_Dance,main_category_Journalism,currency_GBP,currency_USD,currency_CAD,currency_AUD,currency_NOK,currency_EUR,currency_MXN,currency_SEK,currency_NZD,currency_CHF,currency_DKK,currency_HKD,currency_SGD,currency_JPY,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success,days,month_launched
0,1000002330,The Songs of Adelaide & Abullah,Poetry,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,58,8
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,59,9
2,1000004038,Where is Hank?,Narrative Film,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,44,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,29,3
5,1000014025,Monarch Espresso Bar,Restaurants,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,1,34,2


In [26]:
df2['currency'].value_counts()

USD    261511
GBP     29476
EUR     14378
CAD     12375
AUD      6621
SEK      1510
MXN      1411
NZD      1274
DKK       929
CHF       652
NOK       584
HKD       477
SGD       454
JPY        23
Name: currency, dtype: int64

In [27]:
encoded_df.drop(['name', 'category', 'deadline', 'launched', 'pledged', 'state'], axis=1, inplace=True)

In [28]:
encoded_df.head()

Unnamed: 0,ID,main_category_Publishing,main_category_Film & Video,main_category_Music,main_category_Food,main_category_Crafts,main_category_Games,main_category_Design,main_category_Comics,main_category_Fashion,main_category_Theater,main_category_Art,main_category_Photography,main_category_Technology,main_category_Dance,main_category_Journalism,currency_GBP,currency_USD,currency_CAD,currency_AUD,currency_NOK,currency_EUR,currency_MXN,currency_SEK,currency_NZD,currency_CHF,currency_DKK,currency_HKD,currency_SGD,currency_JPY,goal,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success,days,month_launched
0,1000002330,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1000.0,0,GB,0.0,0.0,1533.95,0,58,8
1,1000003930,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,30000.0,15,US,100.0,2421.0,30000.0,0,59,9
2,1000004038,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,45000.0,3,US,220.0,220.0,45000.0,0,44,1
3,1000007540,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,5000.0,1,US,1.0,1.0,5000.0,0,29,3
5,1000014025,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,50000.0,224,US,52375.0,52375.0,50000.0,1,34,2


In [29]:
# Since we are not using the number of backers, nor predicting - this data will be best left out.  
# This model is not used to predict this.

encoded_df.drop(['usd pledged', 'usd_pledged_real', 'backers', 'country', 'usd_goal_real', 'ID'], axis=1, inplace=True)




In [30]:
# Let's split 'em for the model

from sklearn.model_selection import train_test_split

X = encoded_df.drop(['success'],axis=1).values
y = encoded_df['success'].values	

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [31]:
# This is the model I will be using for the learned embedding

# ...  maybe not.  But, I'll leave it here just in case.
# from sklearn.preprocessing import LabelEncoder
#
# def prepare_inputs(X_train, X_test):
# 	X_train_enc, X_test_enc = list(), list()
# 	# label encode each column
# 	for i in range(X_train.shape[1]):
# 		le = LabelEncoder()
# 		le.fit(X_train[:, i])
# 		# encode
# 		train_enc = le.transform(X_train[:, i])
# 		test_enc = le.transform(X_test[:, i])
# 		# store
# 		X_train_enc.append(train_enc)
# 		X_test_enc.append(test_enc)

In [32]:
# Imports for the machine learning model

from keras.models import Sequential
from keras.layers import Dense

In [33]:
# changing approach, abandoning the train/test split, upon further review:

encoded_df[[c for c in encoded_df if c not in ['success']] 
       + ['success']]


Unnamed: 0,main_category_Publishing,main_category_Film & Video,main_category_Music,main_category_Food,main_category_Crafts,main_category_Games,main_category_Design,main_category_Comics,main_category_Fashion,main_category_Theater,main_category_Art,main_category_Photography,main_category_Technology,main_category_Dance,main_category_Journalism,currency_GBP,currency_USD,currency_CAD,currency_AUD,currency_NOK,currency_EUR,currency_MXN,currency_SEK,currency_NZD,currency_CHF,currency_DKK,currency_HKD,currency_SGD,currency_JPY,goal,days,month_launched,success
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1000.0,58,8,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,30000.0,59,9,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,45000.0,44,1,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,5000.0,29,3,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,50000.0,34,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378654,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6500.0,29,3,0
378657,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1500.0,26,6,0
378658,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,15000.0,45,7,0
378659,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,15000.0,30,1,0


In [34]:
# encoded_df[[:, :-1]].values

In [35]:
encoded_columns = list(encoded_df.columns)

In [36]:
# Do this later #

# Setting to X,y values, with the y value reshaped to be a 2d array
y = y.reshape((len(y), 1))

print(X.shape)
print(y.shape)

(331675, 32)
(331675, 1)


In [37]:
# Baseline value for binary classification of success rate

## Get baseline by average of success/failure from target

In [38]:
# simple perceptron model

model1 = Sequential()
model1.add(Dense(1,input_dim=32, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
model1.fit(X,y, epochs=10);

Epoch 1/10
Epoch 2/10
Epoch 3/10
   83/10365 [..............................] - ETA: 19s - loss: 2047927.6469 - accuracy: 0.5610

KeyboardInterrupt: ignored

In [None]:
scores = model1.evaluate(X, y)
print(f"{model1.metrics_names[1]}: {scores[1]*100}")

#49% for a baseline

In [39]:
# Consider more neurons - layers and neurons

from tensorflow.keras.layers import LSTM, Dropout

model2 = Sequential()
model2.add(Dense(32, input_dim=32, activation='sigmoid'))
model2.add(Dropout(0.25))
model2.add(Dense(64, activation='sigmoid'))
model2.add(Dropout(0.25))
model2.add(Dense(32, activation='sigmoid'))
model2.add(Dense(units=1, activation='sigmoid'))

In [40]:
# Consider bringing in the SGD module to be able to make changes - 
# increase learning rate (back propagation - increase to converge faster?)

# Take a look into the loss. (look into the gammas and betas (regularization penalizing accuracy))


model2.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [41]:
model2.fit(X, y,
          epochs=10, batch_size=16)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7e900aabd0>

In [56]:
scores = model2.evaluate(X, y)
print(f"{model2.metrics_names[1]}: {scores[1]*100}")

accuracy: 59.75216627120972


In [48]:
model3 = Sequential()
model3.add(Dense(128, input_dim=32, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(256, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(256, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(256, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(128, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(128, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(64, activation='sigmoid'))
model3.add(Dropout(0.25))
model3.add(Dense(32, activation='sigmoid'))
model3.add(Dense(units=1, activation='sigmoid'))

In [49]:
model3.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [50]:
model3.fit(X, y,
          epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
 4574/20730 [=====>........................] - ETA: 46s - loss: 0.6738 - accuracy: 0.5983

KeyboardInterrupt: ignored

In [None]:
# Test DataFrame

test_df = pd.DataFrame(columns = encoded_columns)

print(test_df)

Empty DataFrame
Columns: [ID, main_category_Publishing, main_category_Film & Video, main_category_Music, main_category_Food, main_category_Crafts, main_category_Games, main_category_Design, main_category_Comics, main_category_Fashion, main_category_Theater, main_category_Art, main_category_Photography, main_category_Technology, main_category_Dance, main_category_Journalism, currency_GBP, currency_USD, currency_CAD, currency_AUD, currency_NOK, currency_EUR, currency_MXN, currency_SEK, currency_NZD, currency_CHF, currency_DKK, currency_HKD, currency_SGD, currency_JPY, goal, usd_goal_real, success, days, month_launched]
Index: []


In [None]:
encoded_columns

['ID',
 'main_category_Publishing',
 'main_category_Film & Video',
 'main_category_Music',
 'main_category_Food',
 'main_category_Crafts',
 'main_category_Games',
 'main_category_Design',
 'main_category_Comics',
 'main_category_Fashion',
 'main_category_Theater',
 'main_category_Art',
 'main_category_Photography',
 'main_category_Technology',
 'main_category_Dance',
 'main_category_Journalism',
 'currency_GBP',
 'currency_USD',
 'currency_CAD',
 'currency_AUD',
 'currency_NOK',
 'currency_EUR',
 'currency_MXN',
 'currency_SEK',
 'currency_NZD',
 'currency_CHF',
 'currency_DKK',
 'currency_HKD',
 'currency_SGD',
 'currency_JPY',
 'goal',
 'usd_goal_real',
 'success',
 'days',
 'month_launched']

In [None]:
main_categories = ['Publishing', 'Film & Video', 'Music', '

def model_prepare(dataframe):
