In [1]:
# imports
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import numpy as np

from helper_functions import load_data
from helper_functions import my_split
from helper_functions import upsample_minority
from helper_functions import downsample_majority
from helper_functions import model_prep
from helper_functions import get_results
from helper_functions import get_f1
from helper_functions import clean_text

In [14]:
# loading the data
# load_data comes from the project's helper_functions module; the preview below shows it
# yields a DataFrame with text columns (name, desc), campaign metadata and the
# final_status label.
df = load_data()
# Bare last expression so the notebook renders the first rows as a table.
df.head()

Unnamed: 0,name,desc,goal,disable_communication,country,currency,deadline,launched_at,final_status,campaign_length,launch_year,launch_month,launch_day,launch_weekday
0,drawing for dollars,I like drawing pictures. and then i color them...,20.0,0,US,USD,2009-05-03 02:59:59,2009-04-24 15:52:03,1,8,2009,4,24,4
1,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,0,US,USD,2009-05-15 19:10:00,2009-04-28 23:26:32,0,17,2009,4,28,1
2,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,0,US,USD,2009-05-22 17:26:00,2009-05-12 17:39:58,0,10,2009,5,12,1
3,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,0,US,USD,2009-05-28 20:09:00,2009-04-28 20:58:50,1,30,2009,4,28,1
4,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,0,US,USD,2009-05-31 07:38:00,2009-05-01 08:22:21,0,30,2009,5,1,4


In [3]:
# setting variables
# Mini-batch size handed to model.fit.
batch_size = 32
# Vocabulary cap: used both as Tokenizer(num_words=...) and as the Embedding input size.
max_features = 20000
# Name of the text column that is cleaned, tokenized and fed to the model.
features = 'name'
# Name of the label column passed to model_prep; the network ends in one sigmoid unit
# trained with binary cross-entropy.
target = 'final_status'

In [15]:
# cleaning the data
# Missing entries become empty strings first, so clean_text only ever receives str values;
# clean_text is then run on every entry of the column.
df[features] = df[features].fillna('').apply(clean_text)

In [16]:
# train/test split
# `year` is the cut-off handed to my_split (helper_functions).
# NOTE(review): presumably rows are divided by launch year around this value -- confirm
# the exact rule (and whether the returned frames are copies or slices of df) in my_split.
year = 2015
train, test = my_split(df, year)

In [17]:
# transforming words to integer values
# The vocabulary is fitted on the training split only; the test split is merely encoded
# with it, so no test-set words influence the word index.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train[features])

# Rebind train/test to new frames instead of writing into them in place: if my_split
# returned slices of df (not visible from this notebook -- TODO confirm), in-place column
# assignment would trigger SettingWithCopyWarning. Names and resulting contents are
# the same as before.
train = train.assign(**{features: tokenizer.texts_to_sequences(train[features])})
test = test.assign(**{features: tokenizer.texts_to_sequences(test[features])})

# processing data
X_train, y_train, X_test, y_test = model_prep(train, test, features, target, onehot=False, scale=False)
# Longest encoded training sequence; every sequence is padded to this length
# (longer test sequences are cut down to it by pad_sequences).
maxlen = max(len(seq) for seq in train[features])

# padding sequences to all be the same length (trailing positions are filled with 0)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

# instantiating the model
model = Sequential()
# mask_zero=True makes the LSTM skip the trailing 0 padding. Without it the recurrent
# state is updated on every pad step and the final state (the one fed to Dense) is taken
# after the padding rather than after the last real token. Index 0 is never assigned to
# a word by the Keras Tokenizer, so masking it hides no vocabulary.
model.add(Embedding(max_features, 128, mask_zero=True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: probability of the positive class of `target`.
model.add(Dense(1, activation='sigmoid'))

# compiling the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', get_f1])

# fitting the model
# NOTE(review): the test split doubles as validation data here, so the val_* metrics are
# not an independent estimate if they are used to tune the model.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=5,
                    validation_data=(X_test, y_test))

Epoch 1/5
 277/2826 [=>............................] - ETA: 14:07 - loss: 0.6412 - accuracy: 0.6646 - get_f1: 0.0000e+00

KeyboardInterrupt: 