## Data Exploration

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train.csv')
train, test = train_test_split(df)
train.shape, test.shape

((81096, 14), (27033, 14))

In [2]:
train.head(2)

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
78215,kkst1736959519,"Get Away Soul """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",Let's get away!,500.0,get-away-soul-ep-choyce,False,US,USD,1413817697,1413817697,1409926748,1409929697,4,0
36111,kkst1783714601,DreamHopping,This is a website that lets you submit dreams ...,20000.0,dreamhopping,False,US,USD,1364085415,1364085415,1361242183,1361497015,9,0


In [3]:
test.head(1)

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
78668,kkst1972708442,U District Parklet,"Help us build the U District's first Parklet, ...",6000.0,u-district-parklet,False,US,USD,1414172751,1414172751,1395722084,1411580751,107,1


In [4]:
train = train.drop(columns=['project_id', 'state_changed_at', 'launched_at', 'backers_count'])
test = test.drop(columns=['project_id', 'state_changed_at', 'launched_at', 'backers_count'])

In [5]:
train[['name', 'keywords']].head(10)

Unnamed: 0,name,keywords
78215,"Get Away Soul """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",get-away-soul-ep-choyce
36111,DreamHopping,dreamhopping
19662,"Old School Jazz, Funk Band...""""""""""""""""""""""""""""""""""...",old-school-jazz-funk-bandthe-tonze
82839,Science All Around Us with Collin Keegan,science-all-around-us-with-collin-keegan
42005,Meridian Miniatures Steampunk Army,meridian-miniatures-steampunk-army
87741,"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",the-naked-pixel-fine-art-nudes-collection-2nd-...
83577,Upstate Craft Beer Co. (Phase 1 - Fermenters),upstate-craft-beer-co-phase-1-fermenters
80513,Dying of thirst: There is a cold one in the back.,dying-of-thirst
34794,"Danny Dierks and the Heir of Pendragon, Young ...",danny-dierks-and-the-heir-of-pendragon-young-a...
102025,Imaginal Jeans (Kosmolupo),imaginal-jeans-kosmolupo


In [6]:
train['disable_communication'].value_counts()

False    80854
True       242
Name: disable_communication, dtype: int64

In [7]:
train.head(1)

Unnamed: 0,name,desc,goal,keywords,disable_communication,country,currency,deadline,created_at,final_status
78215,"Get Away Soul """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",Let's get away!,500.0,get-away-soul-ep-choyce,False,US,USD,1413817697,1409926748,0


In [8]:
train['duration'] = (train['deadline'] - train['created_at']) / (3600 * 24)
test['duration'] = (test['deadline'] - test['created_at']) / (3600 * 24)
train['duration'][:5]

78215    45.034132
36111    32.907778
19662    59.858831
82839    49.218299
42005    36.006713
Name: duration, dtype: float64

In [9]:
train['currency'].value_counts()

USD    69086
GBP     6569
CAD     2743
AUD     1399
EUR      625
NZD      267
SEK      175
DKK      149
NOK       83
Name: currency, dtype: int64

In [10]:
train['country'].value_counts()

US    69086
GB     6569
CA     2743
AU     1399
NL      541
NZ      267
SE      175
DK      149
IE       83
NO       83
DE        1
Name: country, dtype: int64

In [11]:
train.head(1)

Unnamed: 0,name,desc,goal,keywords,disable_communication,country,currency,deadline,created_at,final_status,duration
78215,"Get Away Soul """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",Let's get away!,500.0,get-away-soul-ep-choyce,False,US,USD,1413817697,1409926748,0,45.034132


In [12]:
train['final_status'].value_counts()

0    55229
1    25867
Name: final_status, dtype: int64

## Make Target and Features

In [13]:
X_train = train.drop(columns=['name', 'deadline', 'created_at', 'final_status', 'desc', 'keywords'])
y_train = train['final_status']

X_test = test.drop(columns=['name', 'deadline', 'created_at', 'final_status', 'desc', 'keywords'])
y_test = test['final_status']

In [14]:
X_train.head()

Unnamed: 0,goal,disable_communication,country,currency,duration
78215,500.0,False,US,USD,45.034132
36111,20000.0,False,US,USD,32.907778
19662,10000.0,False,US,USD,59.858831
82839,35000.0,False,US,USD,49.218299
42005,6000.0,False,GB,GBP,36.006713


In [15]:
X_train['disable_communication'] = X_train['disable_communication'].replace(False, 0)
X_train['disable_communication'] = X_train['disable_communication'].replace(True, 1)

X_test['disable_communication'] = X_test['disable_communication'].replace(False, 0)
X_test['disable_communication'] = X_test['disable_communication'].replace(True, 1)

In [16]:
X_train.head()

Unnamed: 0,goal,disable_communication,country,currency,duration
78215,500.0,0.0,US,USD,45.034132
36111,20000.0,0.0,US,USD,32.907778
19662,10000.0,0.0,US,USD,59.858831
82839,35000.0,0.0,US,USD,49.218299
42005,6000.0,0.0,GB,GBP,36.006713


In [17]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 17.2MB/s eta 0:00:01[K     |████████▏                       | 20kB 5.7MB/s eta 0:00:01[K     |████████████▏                   | 30kB 7.4MB/s eta 0:00:01[K     |████████████████▎               | 40kB 7.6MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 6.3MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 6.9MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 7.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [18]:
import category_encoders as ce

encoder = ce.OneHotEncoder(use_cat_names=True)

X_train_enc = encoder.fit_transform(X_train)
X_test_enc = encoder.transform(X_test)

  import pandas.util.testing as tm


In [19]:
X_train_enc.head()

Unnamed: 0,goal,disable_communication,country_US,country_GB,country_NL,country_CA,country_AU,country_NZ,country_SE,country_IE,country_DK,country_NO,country_DE,currency_USD,currency_GBP,currency_EUR,currency_CAD,currency_AUD,currency_NZD,currency_SEK,currency_DKK,currency_NOK,duration
78215,500.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,45.034132
36111,20000.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,32.907778
19662,10000.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,59.858831
82839,35000.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,49.218299
42005,6000.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,36.006713


In [20]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()

X_train_norm = normalizer.fit_transform(X_train_enc)
X_test_norm = normalizer.transform(X_test_enc)

## NN Model

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [22]:
X_train_norm.shape, X_test_norm.shape

((81096, 23), (27033, 23))

In [23]:
model = Sequential()
model.add(Dense(10, activation='sigmoid', input_dim=23))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

In [24]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
model.fit(X_train_norm, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8a15901518>

In [26]:
y_train.value_counts(normalize=True)

0    0.681032
1    0.318968
Name: final_status, dtype: float64

In [27]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_norm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
NN_results = model.predict(X_test_norm)

In [29]:
LR_result = lr.predict(X_test_norm)

In [30]:
NN_results[:5]

array([[0.34514886],
       [0.32788542],
       [0.24822083],
       [0.3424795 ],
       [0.3360466 ]], dtype=float32)

In [31]:
NN_results = pd.DataFrame(NN_results)
LR_results = pd.DataFrame(LR_result)

In [32]:
LR_results[0].value_counts(normalize=True)

0    0.97799
1    0.02201
Name: 0, dtype: float64

## NLP Model

In [33]:
NLP_train = train[['desc']]
NLP_test = test[['desc']]

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
NLP_train['desc'] = NLP_train['desc'].apply(lambda x: str(x))
NLP_test['desc'] = NLP_test['desc'].apply(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

dtm = tfidf.fit_transform(NLP_train['desc'])

dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [37]:

test_dtm = tfidf.transform(NLP_test['desc'])

test_dtm = pd.DataFrame(test_dtm.todense(), columns=tfidf.get_feature_names())

In [38]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

log_reg.fit(dtm, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
results = log_reg.predict(test_dtm)

In [40]:
results = pd.Series(results)

In [41]:
results.value_counts(normalize=True)

0    0.865535
1    0.134465
dtype: float64

## NLP RNN

In [42]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM

In [43]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

In [44]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [45]:
NLP_train['tokens'] = NLP_train['desc'].apply(tokenize)
NLP_test['tokens'] = NLP_test['desc'].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [46]:
NLP_train.head()

Unnamed: 0,desc,tokens
78215,Let's get away!,"[let, away]"
36111,This is a website that lets you submit dreams ...,"[website, lets, submit, dreams, matched, users..."
19662,"Old School R&B, Jazz Funk Band","[old, school, jazz, funk, band]"
82839,An educational series created by and starring ...,"[educational, series, created, starring, ambit..."
42005,Funding the production of a full 28mm miniatur...,"[funding, production, mm, miniature, army, fit..."


In [47]:
tokens = pd.concat((NLP_train['tokens'], NLP_test['tokens']))

In [48]:
id2word = corpora.Dictionary(tokens)

In [49]:
id2word.token2id['color']

804

In [50]:
def word_to_num(row):
  return [id2word.token2id[word] for word in row]

In [51]:
NLP_train['data'] = NLP_train['tokens'].apply(word_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [52]:
NLP_test['data'] = NLP_test['tokens'].apply(word_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [53]:
NLP_train['data'][:5]

78215                                      [1, 0]
36111                       [8, 4, 6, 3, 5, 7, 2]
19662                         [12, 13, 11, 10, 9]
82839        [17, 19, 16, 20, 14, 22, 15, 18, 21]
42005    [25, 29, 28, 27, 23, 24, 32, 30, 31, 26]
Name: data, dtype: object

In [54]:

maxlen = 30

X_train = sequence.pad_sequences(NLP_train['data'], maxlen=maxlen)
X_test = sequence.pad_sequences(NLP_test['data'], maxlen=maxlen)

In [55]:
len(X_train), len(y_train)

(81096, 81096)

In [62]:
model = Sequential()

model.add(Embedding(81096, 128))
model.add(LSTM(128))
model.add(Dense(128))
model.add(Dense(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

In [65]:
model.fit(X_train, y_train, epochs=10, batch_size=512, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f89fbb58630>

In [66]:
model.save('saved_model.pkl')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: saved_model.pkl/assets


In [64]:
model.evaluate(X_test, y_test)



[0.6280802488327026, 0.6783930659294128]

In [60]:
import numpy as np
result = model.predict(X_test[0])
np.mean(result)

0.47592577

In [166]:
type(NLP_train['data'][0][0])

int

In [154]:
import pickle
filename = 'model.pkl'
pickle.dumps(model)

TypeError: ignored

In [61]:
from sklearn.externals import joblib 
joblib.dump(model, 'model.pkl') 



TypeError: ignored