A baseline is the result of a very basic model or solution. You generally create a baseline first and then try more complex solutions in order to get a better result. If a more complex solution scores better than the baseline, the added complexity is paying off; if it does not, the simpler baseline is the better choice.


## Load the data

In [20]:
import pandas as pd
# Read the Kickstarter projects file, parsing both timestamp columns as datetimes.
date_columns = ['deadline', 'launched']
ks = pd.read_csv("ks-projects-201801-extra.csv", parse_dates=date_columns)

# Preview the first six rows.
ks.head(6)

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,...,n_polysyllable_words,flesch_kincaid_grade_level,flesch_reading_ease,smog_index,gunning_fog_index,coleman_liau_index,automated_readability_index,lix,gulpease_index,wiener_sachtextformel
0,0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,...,1,5.24,66.4,8.841846,10.0,7.680995,4.62,45.0,99.0,7.057
1,1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,...,0,0.72,97.025,3.1291,1.6,3.996687,2.35375,29.0,117.75,0.5838
2,2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,...,0,-2.62,119.19,3.1291,1.2,-4.103777,-2.66,3.0,152.333333,-3.6434
3,3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,...,1,10.74,30.53,8.841846,8.514286,16.091526,11.002857,49.857143,70.428571,7.216829
4,4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,...,3,9.655,40.09,13.023867,18.2,17.249855,12.0075,58.0,64.0,12.1601
5,5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,...,0,1.313333,90.99,3.1291,1.2,9.615875,8.33,69.666667,129.0,6.093267


In [21]:
# Show every distinct value found in the 'state' column.
state_values = ks['state'].unique().tolist()
print("Unique values in 'state' column:", state_values)

Unique values in 'state' column: ['failed', 'canceled', 'successful', 'live', 'undefined', 'suspended']


## Prepare the target column

In [22]:
# Drop projects that are still "live" (a query is a request for a subset of
# the data), then add the binary target column `outcome`:
# 1 when state is "successful", 0 for every other state.
ks = (
    ks.query('state!= "live"')
      .assign(outcome=lambda frame: frame['state'].eq('successful').astype(int))
)

## Convert timestamps

In [23]:
# Split the `launched` timestamp into separate hour, day, month and year columns.
launched_parts = ks['launched'].dt
ks = ks.assign(
    hour=launched_parts.hour,
    day=launched_parts.day,
    month=launched_parts.month,
    year=launched_parts.year,
)

## Prep categorical variables

In [24]:
from sklearn.preprocessing import LabelEncoder

# Columns holding string categories that must become integer codes.
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()


def _encode_column(column):
    """Re-fit the shared encoder on one column and return its integer codes."""
    return encoder.fit_transform(column)


# The encoder is re-fitted for each column in turn, so after this cell it
# only holds the fit from the last column processed.
encoded = ks[cat_features].apply(_encode_column)


In [25]:
# Combine the numeric/time columns and the target with the label-encoded
# categorical columns (joined on the row index).
base_columns = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data = ks[base_columns].join(encoded)
data.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


## Create training, validation and test splits

In [26]:
# Hold out the last 10% of rows for testing and the 10% before that for
# validation; everything earlier is used for training. Rows are taken in
# their existing order (no shuffling).
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

# Use explicit positional boundaries instead of negative slices: when
# valid_size is 0 (fewer than 10 rows), `data[:-0]` is empty and `data[-0:]`
# is the whole frame, which would leave nothing to train on.
n_rows = len(data)
valid_start = n_rows - 2 * valid_size
test_start = n_rows - valid_size

train = data.iloc[:valid_start]
valid = data.iloc[valid_start:test_start]
test = data.iloc[test_start:]

## Train a model
Tree-based model

In [28]:
import lightgbm as lgb

# Every column except the target is a model input.
feature_cols = train.columns.drop('outcome')

# Wrap the training and validation splits as LightGBM datasets.
dtrain, dvalid = (
    lgb.Dataset(split[feature_cols], label=split['outcome'])
    for split in (train, valid)
)

# Binary classifier scored with AUC on the validation set.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
# arguments of older LightGBM releases; confirm the installed version still
# accepts them (newer releases appear to expect callbacks instead).
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    early_stopping_rounds=10,
    verbose_eval=False,
)

## Make predictions & evaluate the model

In [37]:
from sklearn.metrics import roc_auc_score

# Predict on the held-out test split and score the predictions with ROC AUC.
# `roc_auc_score` is called directly: the `metrics` module is never imported
# in this notebook, so `metrics.roc_auc_score` raises NameError on a fresh kernel.
ypred = bst.predict(test[feature_cols])
score = roc_auc_score(test['outcome'], ypred)

print(f"Test AUC score: {score}")

Test AUC score: 0.747615303004287


In [39]:
# Rebuild the LightGBM datasets for all three splits.
dtrain, dvalid, dtest = (
    lgb.Dataset(split[feature_cols], label=split['outcome'])
    for split in (train, valid, test)
)

# Same configuration as before, but without `verbose_eval=False`, so the
# validation AUC is logged while training.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# NOTE(review): `early_stopping_rounds` is a keyword argument of older
# LightGBM releases; confirm the installed version still accepts it.
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    early_stopping_rounds=10,
)

[1]	valid_0's auc: 0.694192
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.697026
[3]	valid_0's auc: 0.70002
[4]	valid_0's auc: 0.701645
[5]	valid_0's auc: 0.70601
[6]	valid_0's auc: 0.707926
[7]	valid_0's auc: 0.70945
[8]	valid_0's auc: 0.710437
[9]	valid_0's auc: 0.712047
[10]	valid_0's auc: 0.713417
[11]	valid_0's auc: 0.714648
[12]	valid_0's auc: 0.715791
[13]	valid_0's auc: 0.717431
[14]	valid_0's auc: 0.718216
[15]	valid_0's auc: 0.719381
[16]	valid_0's auc: 0.720884
[17]	valid_0's auc: 0.721617
[18]	valid_0's auc: 0.722789
[19]	valid_0's auc: 0.723307
[20]	valid_0's auc: 0.72501
[21]	valid_0's auc: 0.725721
[22]	valid_0's auc: 0.727384
[23]	valid_0's auc: 0.728268
[24]	valid_0's auc: 0.72865
[25]	valid_0's auc: 0.729141
[26]	valid_0's auc: 0.729552
[27]	valid_0's auc: 0.730459
[28]	valid_0's auc: 0.731047
[29]	valid_0's auc: 0.732472
[30]	valid_0's auc: 0.732801
[31]	valid_0's auc: 0.733166
[32]	valid_0's auc: 0.734182
[33]	valid_0's auc: 0.734