In [1]:
import pandas as pd

In [12]:
df = pd.read_csv('ks-projects-201801.csv')

In [13]:
df.shape

(378661, 15)

In [14]:
df.head()
df.dtypes

ID                    int64
name                 object
category             object
main_category        object
currency             object
deadline             object
goal                float64
launched             object
pledged             float64
state                object
backers               int64
country              object
usd pledged         float64
usd_pledged_real    float64
usd_goal_real       float64
dtype: object

###### Prepare the target column

The *state* column show the outcome of the project

**Prepare target column:**
- Dropping projects that are live
- Counting "successful" states as outcome = 1
- Combining every other state as outcome = 0

In [15]:
#unique values
df['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [6]:
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [16]:
df = df[df.state != 'live' ]

In [17]:
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
suspended       1846
Name: state, dtype: int64

In [18]:
df['state'] = df['state'].map({'successful':1, 'failed':0, 'canceled':0, 'undefined':0, 'suspended':0})

In [19]:
df['state'].value_counts()

0    241906
1    133956
Name: state, dtype: int64

X = df['launched'].map(lambda x: x.day)


###### Deal with dates

In [26]:
df['launched'] = pd.to_datetime(df['launched'])

In [34]:
df['day_launched'] = df['launched'].dt.day
df['month_launched'] = df['launched'].dt.month
df['year_launched'] = df['launched'].dt.year

In [44]:
df['deadline'] = pd.to_datetime(df['deadline'])

In [45]:
df['day_deadline'] = df['deadline'].dt.day
df['month_deadline'] = df['deadline'].dt.month
df['year_deadline'] = df['deadline'].dt.year

In [36]:
df = df.drop('launched', axis=1)

In [46]:
df = df.drop('deadline', axis=1)

###### Prep Categorical variables

In [47]:
import numpy as np
df_cat = df.select_dtypes(include=np.object)
df_cat.columns

Index(['name', 'category', 'main_category', 'currency', 'country'], dtype='object')

In [49]:
df = df.drop('name', axis=1)  # name feature does not add relevant information

In [54]:
from sklearn.preprocessing import LabelEncoder

df['category'].nunique(), df['main_category'].nunique(),df['currency'].nunique(), df['country'].nunique()


(159, 15, 14, 23)

In [57]:
encoder = LabelEncoder()

encoded = df_cat[['category', 'main_category', 'currency', 'country']].apply(encoder.fit_transform)

In [62]:
df[['category', 'main_category', 'currency', 'country']] = encoded

In [65]:
df = df.drop('ID', axis=1)


In [70]:
df = df.drop('usd pledged', axis=1)

In [61]:
df.columns

Index(['ID', 'category', 'main_category', 'currency', 'goal', 'pledged',
       'state', 'backers', 'country', 'usd pledged', 'usd_pledged_real',
       'usd_goal_real', 'day_launched', 'month_launched', 'year_launched',
       'day_deadline', 'month_deadline', 'year_deadline'],
      dtype='object')

###### Create training and test splits

In [66]:
from sklearn.model_selection import train_test_split

In [71]:
X = df.drop('state', axis=1)
y = df['state']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [72]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [76]:
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score, roc_auc_score

score = accuracy_score(y_test, y_pred)
score

0.9867698633450236

In [78]:
score = roc_auc_score(y_test, y_pred)
score


0.9879655074459077

In [80]:
df.to_csv('cleaned_data.csv', index=False)