In [1]:
# imports
import pandas as pd
import os
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
from sklearn.utils import resample

from helper_functions import load_data
from helper_functions import my_split
from helper_functions import upsample_minority
from helper_functions import downsample_majority

# Shared one-hot encoder; use_cat_names=True names the generated columns after
# the category values. Every modelling cell below re-fits this same instance.
encoder = ce.OneHotEncoder(use_cat_names=True)

In [3]:
# Load the campaign dataset through the project helper and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means a
# saved index was read back as a data column - confirm inside load_data.
df = load_data()
df.head()

Unnamed: 0,name,desc,goal,disable_communication,country,currency,deadline,launched_at,final_status,campaign_length,launch_year,launch_month,launch_day,launch_weekday
0,drawing for dollars,I like drawing pictures. and then i color them...,20.0,0,US,USD,2009-05-03 02:59:59,2009-04-24 15:52:03,1,8,2009,4,24,4
1,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,0,US,USD,2009-05-15 19:10:00,2009-04-28 23:26:32,0,17,2009,4,28,1
2,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,0,US,USD,2009-05-22 17:26:00,2009-05-12 17:39:58,0,10,2009,5,12,1
3,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,0,US,USD,2009-05-28 20:09:00,2009-04-28 20:58:50,1,30,2009,4,28,1
4,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,0,US,USD,2009-05-31 07:38:00,2009-05-01 08:22:21,0,30,2009,5,1,4


In [4]:
# Split the data into train and test sets with the project helper.
# NOTE(review): `year` is presumably a launch-year cutoff separating the two
# sets - confirm the exact boundary rule in my_split.
year = 2015
train, test = my_split(df, year)

In [5]:
# Baseline: always predict the most frequent training label.
# The accuracy shown below is the floor any real model has to beat.
majority = train['final_status'].value_counts().idxmax()

# One constant prediction per test row.
y_pred = [majority] * len(test)
accuracy_score(test['final_status'], y_pred)

0.7691787003610109

In [6]:
# Label column the models are trained to predict.
target = 'final_status'

# Model input columns. The order is kept fixed because it determines the
# column order of the encoded design matrix.
features = [
    # shown as numbers in the df.head() preview
    'goal',
    'campaign_length',
    'launch_month',
    'launch_day',
    'launch_weekday',
    'disable_communication',
    # shown as strings in the df.head() preview
    'country',
    'currency',
]

In [7]:
# Logistic regression trained on the original, unbalanced class mix.

# Separate inputs from labels for both partitions.
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

# Fit the shared encoder on training rows only, then apply that fitted
# encoding to the test rows so both matrices share the same columns.
X_train_one_hot = encoder.fit_transform(X_train)
X_test_one_hot = encoder.transform(X_test)

# fit() returns the estimator itself, so the cell's displayed value is the
# score on the test partition.
# NOTE(review): the recorded score is identical to the majority-class baseline
# recorded earlier in this notebook, which suggests the model may be predicting
# the majority class for every test row - confirm.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_one_hot, y_train).score(X_test_one_hot, y_test)

0.7691787003610109

In [8]:
# Class balance of the training labels: the recorded counts are roughly 2:1
# in favour of class 0, an imbalance that is probably affecting the model above.
train[target].value_counts()

0    59932
1    30469
Name: final_status, dtype: int64

In [15]:
# Logistic regression trained after upsampling the minority class.
# Only the training rows are rebalanced; the test partition is left as-is.
# NOTE(review): upsample_minority is a project helper - presumably it samples
# minority rows with replacement; confirm whether it fixes a random seed.
train_upsampled = upsample_minority(train)

# Separate inputs from labels for both partitions.
X_train, y_train = train_upsampled[features], train_upsampled[target]
X_test, y_test = test[features], test[target]

# Re-fit the shared encoder on the rebalanced training rows, then apply that
# fitted encoding to the test rows.
X_train_one_hot = encoder.fit_transform(X_train)
X_test_one_hot = encoder.transform(X_test)

# fit() returns the estimator itself, so the cell's displayed value is the
# score on the test partition.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_one_hot, y_train).score(X_test_one_hot, y_test)

0.5402752707581228

In [16]:
# Logistic regression trained after downsampling the majority class.
# Only the training rows are rebalanced; the test partition is left as-is.
# NOTE(review): downsample_majority is a project helper - presumably it drops
# a random subset of majority rows; confirm whether it fixes a random seed.
train_downsampled = downsample_majority(train)

# Separate inputs from labels for both partitions.
X_train, y_train = train_downsampled[features], train_downsampled[target]
X_test, y_test = test[features], test[target]

# Re-fit the shared encoder on the rebalanced training rows, then apply that
# fitted encoding to the test rows.
X_train_one_hot = encoder.fit_transform(X_train)
X_test_one_hot = encoder.transform(X_test)

# fit() returns the estimator itself, so the cell's displayed value is the
# score on the test partition.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_one_hot, y_train).score(X_test_one_hot, y_test)

0.5781814079422383