In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

pd.set_option('display.max_colwidth',100)

In [2]:
#load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0


In [4]:
# convert unix time format
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))

In [5]:
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

for i in np.arange(2):
    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)
    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)

In [6]:
train.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status,name_len,desc_len
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,20.0,drawing-for-dollars,False,US,USD,2009-05-03 12:29:59,2009-05-03 12:30:17,2009-04-25 00:45:07,2009-04-25 01:22:03,3,1,19,134
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois,False,US,USD,2009-05-16 04:40:00,2009-05-16 05:30:18,2009-04-29 04:40:24,2009-04-29 08:56:32,2,0,76,137
2,kkst183622197,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",30.0,mr-squiggles,False,US,USD,2009-05-23 02:56:00,2009-05-23 03:00:18,2009-05-13 02:56:53,2009-05-13 03:09:58,0,0,13,385
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,500.0,help-me-write-my-second-novel,False,US,USD,2009-05-29 05:39:00,2009-05-29 05:45:21,2009-04-29 05:39:55,2009-04-29 06:28:50,18,1,30,131
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,2009-05-31 17:08:00,2009-05-31 17:15:17,2009-05-01 17:08:34,2009-05-01 17:52:21,1,0,38,384


In [7]:
train['name_count'] = train['name'].str.split().str.len()
train['desc_count'] = train['desc'].str.split().str.len()

test['name_count'] = test['name'].str.split().str.len()
test['desc_count'] = test['desc'].str.split().str.len()

In [8]:
train.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status,name_len,desc_len,name_count,desc_count
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,20.0,drawing-for-dollars,False,US,USD,2009-05-03 12:29:59,2009-05-03 12:30:17,2009-04-25 00:45:07,2009-04-25 01:22:03,3,1,19,134,3.0,26.0
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois,False,US,USD,2009-05-16 04:40:00,2009-05-16 05:30:18,2009-04-29 04:40:24,2009-04-29 08:56:32,2,0,76,137,10.0,25.0
2,kkst183622197,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",30.0,mr-squiggles,False,US,USD,2009-05-23 02:56:00,2009-05-23 03:00:18,2009-05-13 02:56:53,2009-05-13 03:09:58,0,0,13,385,2.0,22.0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,500.0,help-me-write-my-second-novel,False,US,USD,2009-05-29 05:39:00,2009-05-29 05:45:21,2009-04-29 05:39:55,2009-04-29 06:28:50,18,1,30,131,6.0,24.0
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,2009-05-31 17:08:00,2009-05-31 17:15:17,2009-05-01 17:08:34,2009-05-01 17:52:21,1,0,38,384,6.0,18.0


In [9]:
train['keywords_len'] = train['keywords'].str.len()
train['keywords_count'] = train['keywords'].str.split('-').str.len()

test['keywords_len'] = test['keywords'].str.len()
test['keywords_count'] = test['keywords'].str.split('-').str.len()

In [10]:
# converting string variables to datetime
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))

In [20]:
# there should be simpler way - might take longer
# creating list with time difference between 1) launched_at and created_at 2) deadline and launched_at

time1 = []
time3 = []
time4 = []
time5 = []
for i in np.arange(train.shape[0]):
    time1.append(np.round((train.loc[i, 'launched_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))
    time3.append(np.round((train.loc[i, 'deadline'] - train.loc[i, 'launched_at']).total_seconds()).astype(int))
    time4.append(np.round((train.loc[i, 'state_changed_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))
    time5.append(np.round((train.loc[i, 'state_changed_at'] - train.loc[i, 'deadline']).total_seconds()).astype(int))    

In [21]:
train['time1'] = np.log(time1)
train['time3'] = np.log(time3)
train['time4'] = np.log(time4)
train['time5'] = np.log(time5)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [24]:
# for test data
time6 = []
time7 = []
time8 = []
time9 = []
for i in np.arange(test.shape[0]):
    time6.append(np.round((test.loc[i, 'launched_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))
    time7.append(np.round((test.loc[i, 'deadline'] - test.loc[i, 'launched_at']).total_seconds()).astype(int))
    time8.append(np.round((test.loc[i, 'state_changed_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))
    time9.append(np.round((test.loc[i, 'state_changed_at'] - test.loc[i, 'deadline']).total_seconds()).astype(int))

In [25]:
test['time1'] = np.log(time6)
test['time3'] = np.log(time7)
test['time4'] = np.log(time8)
test['time5'] = np.log(time9)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [23]:
train.tail()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,...,name_len,desc_len,name_count,desc_count,keywords_len,keywords_count,time1,time3,time4,time5
108124,kkst542772509,Custom Wearable Art,Sacred geometry art and custom clothing. Embroidered hats and accessories. Mixed media designs. ...,2000.0,custom-wearable-art,False,US,USD,2015-05-31 23:37:31,2015-05-31 23:37:34,...,19,135,3.0,19.0,19,3,13.930039,14.76794,15.127444,1.098612
108125,kkst615785942,Painted Modern Wall art Mid-century! $100 for originals!,"I want to create pieces very similar to this one (Striped). Intense, vibrant, mid-century/conte...",15000.0,zacharys-art-project,False,US,USD,2015-05-31 23:44:00,2015-05-31 23:44:01,...,57,131,8.0,19.0,20,3,8.339262,15.374065,15.374945,0.0
108126,kkst1421442073,Creating High Quality Art Prints from the Artist Herron,I am seeking to produce prints to promote my recent collection of art work and have a starting p...,320.0,creating-high-quality-art-prints-from-the-artist-h,False,US,USD,2015-05-31 23:48:54,2015-05-31 23:48:57,...,55,121,9.0,23.0,50,9,11.33583,14.76794,14.799749,1.098612
108127,kkst2022543055,back to basics,A self sustaining homestead with organic produce and free range animals...pay it forward>>>,35000.0,back-to-basics-2,False,US,USD,2015-05-31 23:53:00,2015-05-31 23:53:04,...,14,91,3.0,13.0,16,4,12.763208,14.799405,14.922088,1.386294
108128,kkst169301870,"Happy Olive, Mediterranean Wraps","Mediterranean Wraps, done right!",25000.0,happy-olive-mediterranean-wraps,False,US,USD,2015-05-31 23:58:58,2015-05-31 23:59:00,...,32,32,4.0,4.0,31,4,11.074514,15.173405,15.189861,0.693147


In [26]:
feat = ['disable_communication','country']

for x in feat:
    le = LabelEncoder()
    le.fit(list(train[x].values) + list(test[x].values))
    train[x] = le.transform(list(train[x]))
    test[x] = le.transform(list(test[x]))

In [27]:
train['goal'] = np.log1p(train['goal'])
test['goal'] = np.log1p(test['goal'])

In [28]:
# creating a full list of descriptions from train and etst
kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)

In [29]:
# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower
def desc_clean(word):
    p1 = re.sub(pattern='(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    p1 = p1.lower()
    return p1

kickdesc = kickdesc.map(desc_clean)

In [30]:
stop = set(stopwords.words('english'))
kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]

stemmer = SnowballStemmer(language='english')
kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]

kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]

kickdesc = [' '.join(x) for x in kickdesc]

In [31]:
# Due to memory error, limited the number of features to 650
cv = CountVectorizer(max_features=650)

In [32]:
alldesc = cv.fit_transform(kickdesc).todense()

In [33]:
#create a data frame
combine = pd.DataFrame(alldesc)
combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)

In [34]:
#split the text features

train_text = combine[:train.shape[0]]
test_text = combine[train.shape[0]:]

test_text.reset_index(drop=True,inplace=True)

In [35]:
cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','time4','time5','goal']

In [36]:
target = train['final_status']

In [37]:
train = train.loc[:,cols_to_use]
test = test.loc[:,cols_to_use]

In [38]:
X_train = pd.concat([train, train_text],axis=1)
X_test = pd.concat([test, test_text],axis=1)

In [39]:
print (X_train.shape)
print (X_test.shape)

(108129, 661)
(63465, 661)


In [40]:
dtrain = xgb.DMatrix(data=X_train, label = target)
dtest = xgb.DMatrix(data=X_test)

In [41]:
params = {
    'objective':'binary:logistic',
    'eval_metric':'error',
    'eta':0.025,
    'max_depth':6,
    'subsample':0.7,
    'colsample_bytree':0.7,
    'min_child_weight':5
    
}

In [42]:
# You can probably get better accuracy with rounds > 1000. 
bst = xgb.cv(params, dtrain, num_boost_round=1200, early_stopping_rounds=40,verbose_eval=10)

[0]	train-error:0.310481+0.00134967	test-error:0.313524+0.00186397
[10]	train-error:0.310079+0.00142467	test-error:0.312756+0.00341107
[20]	train-error:0.308881+0.000741766	test-error:0.311692+0.00319392
[30]	train-error:0.307212+0.00133939	test-error:0.310583+0.00249889
[40]	train-error:0.304923+0.000984707	test-error:0.308428+0.00283758
[50]	train-error:0.303189+0.00110169	test-error:0.307346+0.00264051
[60]	train-error:0.301131+0.00134111	test-error:0.305727+0.00249001
[70]	train-error:0.299545+0.00124023	test-error:0.304377+0.00249008
[80]	train-error:0.297811+0.00100372	test-error:0.303267+0.00261035
[90]	train-error:0.296202+0.00100073	test-error:0.302398+0.00289145
[100]	train-error:0.294639+0.00100005	test-error:0.301482+0.00253743
[110]	train-error:0.293344+0.00101025	test-error:0.300669+0.00258567
[120]	train-error:0.292137+0.000928906	test-error:0.299355+0.00235569
[130]	train-error:0.290953+0.00107843	test-error:0.29807+0.00240764
[140]	train-error:0.29008+0.00142581	test-e

In [43]:
bst_train = xgb.train(params, dtrain, num_boost_round=1200)

In [44]:
p_test = bst_train.predict(dtest)

In [45]:
p_test

array([ 0.23220977,  0.08888924,  0.12274242, ...,  0.13250966,
        0.09850245,  0.37319836], dtype=float32)

In [46]:
test.head()

Unnamed: 0,name_len,desc_len,keywords_len,name_count,desc_count,keywords_count,time1,time3,time4,time5,goal
0,9,119,8,1,19.0,1,11.964027,15.056664,15.101046,-inf,8.853808
1,16,119,16,2,20.0,2,11.646556,14.922091,14.959191,-inf,10.463132
2,43,126,43,7,19.0,7,11.416106,14.949674,14.978456,-inf,10.809748
3,58,108,49,8,17.0,6,14.195786,14.76794,15.215387,2.484907,10.59666
4,18,132,18,3,20.0,3,14.759266,15.132977,15.656626,1.386294,9.903538


In [47]:
sub = pd.DataFrame()
test = pd.read_csv('test.csv')
sub['project_id'] = test['project_id']
sub['final_status'] = p_test

sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]

sub.to_csv("xgb_with_python_feats.csv",index=False) #0.70