In [1]:
# import packages 
import pandas as pd
import numpy as np
import matplotlib
import glob
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [2]:
def load_kickstarter_data(datapath):
    '''datapath = location of csv files to be loaded'''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
    print('File import done')
    return df_ks

In [3]:
def load_kickstarter_data_short(datapath):
    ''' 
    This is a version of the main function to load jsut 2 files for use in testing
    datapath = location of csv files to be loaded
    '''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
                # This is here to prevent more than 2 files being loaded to save time in testing
                break
    print('File import done')
    return df_ks

In [4]:
def extract_json_data(data):
    ''' This function extracts specific sub fields from json files embedded in columns of a dataframe
        data: dataframe containing column with json data'''
    data['category_slug'] = pd.DataFrame([json.loads(data["category"][i])['slug'] for i in range(data.shape[0])])
    data['category_name'] = pd.DataFrame.from_dict([json.loads(data["category"][i])['name'] for i in range(data.shape[0])])
    data.drop(labels = 'category', axis=1, inplace=True)
    
# other Jsons are not parsable by this method
#    data['location_name'] = pd.DataFrame.from_dict([json.loads(data["location"][i])['name'] for i in range(data.shape[0])])
#    data['location_country'] = pd.DataFrame([json.loads(data["location"][i])['country'] for i in range(data.shape[0])])
#    data['creater_is_registered'] = pd.DataFrame([json.loads(data["creator"][i])['is_registered'] for i in range(data.shape[0])])
    print('json columns extracted')
    return data
# check output
#df2 = extract_json_data(df)
#df2.head()

In [5]:
def get_duration(data):
    #Convert from unix time stamp to more readable time format
    data['converted_deadline'] = pd.to_datetime(data['deadline'], unit='s')
    data['converted_launched_at'] = pd.to_datetime(data['launched_at'], unit='s')
    #Create project duration variable
    data['project_duration_days'] = (data['converted_deadline'] - data['converted_launched_at']).dt.days
    # Drop redundant columns
    data.drop(columns=['deadline', 'launched_at'], inplace=True)
    return data

In [6]:
def get_target(data,target='state', new_target_var='success', success_label='successful'):
    '''
    creates a dummy variable out of the state
    '''
    data[new_target_var] = data[target].apply(lambda x: 1 if x == success_label else 0)
    return data


In [21]:
def get_target_features(data, target_var='success'):
    '''
    Function that splits dataset into target and feature dataframes
    '''
#    data.head()
    target = data[target_var]
#    target.head()
#  data.drop([target_var,'state'], axis = 1, inplace=True)
    features_raw = data
#    features_raw.head()
#    data.head()
    print('target and features split is done')
    return target, features_raw

In [8]:
def scale_features(data, num_columns):
    ''' Initialize a scaler, then apply it to the features'''
    scaler = MinMaxScaler()
    data[num_columns] = scaler.fit_transform(data[num_columns])
    return data

In [9]:

def currency_conversion(data):
    #Convert the currency of all projects to USD
    data['usd_goal'] = data['goal'] * data['static_usd_rate']
    # drop goal and static_usd_rate to remove redundant data
    data.drop(columns=['goal','static_usd_rate'], inplace=True)
    return data


In [10]:
def drop_columns(data):
    '''remove unnecessary columns'''
    # Drop due to many missing values
    data.drop(columns=['friends', 'is_backing', 'is_starred', 'permissions'], inplace=True)
    # Some Json strings varariables with unusable or already used data
    data.drop(columns=['creator', 'location', 'photo', 'profile', 'slug', 'urls'], inplace=True)
    # Columns that are not specific to the campaign or are redundant or are technical data unrelated to campaign
    data.drop(columns=['created_at','currency', 'currency_symbol', 'currency_trailing_code', 
                     'current_currency', 'disable_communication',
                     'is_starrable', 'source_url', 'spotlight', 'staff_pick', 
                     'usd_type', 'state_changed_at','fx_rate'], inplace=True)
    # drop columns due to being linked to dependent variable which would not be known in advance
    data.drop(columns=['backers_count', 'converted_pledged_amount', 'pledged', 'usd_pledged','id',], inplace=True)
    # drop columns that are not used                
    data.drop(columns=['blurb', 'name', 'category_slug', 'converted_deadline', 'converted_launched_at'], inplace=True)
    return data

In [11]:
def test_train_split_kickstarter(features, target, test_size = 0.2, random_state = 0):
    '''
    Split data into train and test sets based on features and target dataframes.
    Shows results of the split and returns four dataframes
    '''
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = test_size, random_state = random_state)
    # Show the results of the split
    print ("Training set has {} samples.".format(X_train.shape[0]))
    print ("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [12]:
def train_using_gini(X_train, X_test, y_train): 
  
    # Creating the classifier object 
    clf_gini = DecisionTreeClassifier(criterion = "gini", 
            max_depth=3, min_samples_leaf=5) 
    # Performing training 
    clf_gini.fit(X_train, y_train) 
    return clf_gini 

In [13]:
# Function to make predictions 
def prediction(X_test, clf_object): 
  
    # Predicton on test data with model trained using either giniIndex / or entropy
    y_pred = clf_object.predict(X_test) 
    print("Predicted values:\n") 
    print(y_pred) 
    return y_pred 

In [14]:
def cal_accuracy(y_test, y_pred): 
      
    print("Confusion Matrix: \n", 
        confusion_matrix(y_test, y_pred)) 
      
    print ("Accuracy : \n", 
    accuracy_score(y_test,y_pred)*100) 
      
    print("Report : \n", 
    classification_report(y_test, y_pred)) 

In [24]:
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
    
    # Read data

    df = load_kickstarter_data_short('kickstarter/data')
    
    # Extract data from json
    df = extract_json_data(df)
    
    # drop campaigns that are still ongoing
    df = df[df.state != 'live']
    
    # convert unix timestamps and calculate campaign duration
    df = get_duration(df)
    
    #create goal data in single currency
    df = currency_conversion(df)
    
    # encode target variable 'state' to numerical values, success is 1 all others are fail and 0
    df = get_target(df,target='state', new_target_var='success', success_label='successful')

    # drop unnecessary columns
    df=drop_columns(df)
    df.head()

    # Split the data into features and target label
    target, features_raw = get_target_features(df)


    
    
    # split categorical columns into dummies
    features_raw = pd.get_dummies(features_raw)    

    # Clean and augment data
    # scale numerical features
#    num_columns = ['goal']
#    features_raw = scale_features(df, num_columns)
    
    # Split into training and test set
    X_train, X_test, y_train, y_test = test_train_split_kickstarter(features_raw, target, test_size = 0.2, random_state = 0)
    
    # Fit a simple decision tree first
    clf_gini = train_using_gini(X_train, X_test, y_train) 
    
    # Create predictions using simple model
    y_pred = prediction(X_test, clf_gini)
    
    # show results
    cal_accuracy(y_test, y_pred)
    
    # Creating the classifier object 


    # Fit Adaboost classifier using a decision tree as base estimator
    # Test with different number of iterations
    # Append the score output from each iteration of the adboost function
    
    # Compare error rate vs number of iterations
    #   plot_error_rate(er_train, er_test)

Total files: 56
File number 1 added to dataframe
File number 2 added to dataframe
File import done
json columns extracted
target and features split is done
Training set has 5854 samples.
Testing set has 1464 samples.
Predicted values:

[1 0 0 ... 1 0 0]
Confusion Matrix: 
 [[583   0]
 [  0 881]]
Accuracy : 
 100.0
Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       583
           1       1.00      1.00      1.00       881

    accuracy                           1.00      1464
   macro avg       1.00      1.00      1.00      1464
weighted avg       1.00      1.00      1.00      1464



In [25]:
df.head()

Unnamed: 0,country,state,category_name,project_duration_days,usd_goal,success
1,US,successful,Playing Cards,30,1000.0,1
2,US,successful,Rock,30,15000.0,1
3,GB,failed,Playing Cards,59,12160.6569,0
4,US,successful,Nonfiction,30,2800.0,1
5,US,successful,Classical Music,30,3500.0,1


In [27]:
# Check shape of dataframes
print(df.columns)
print(df.shape)
print(X_test.shape)
print(y_test.shape)
print(X_train.shape)
print(y_train.shape)
print(y_train.dtype)

Index(['country', 'state', 'category_name', 'project_duration_days',
       'usd_goal', 'success'],
      dtype='object')
(7318, 6)
(1464, 187)
(1464,)
(5854, 187)
(5854,)
int64


Index(['country', 'category_slug', 'category_name', 'converted_deadline',
       'converted_launched_at', 'project_duration_days', 'usd_goal'],
      dtype='object')

In [42]:
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful"').match.value_counts()

In [50]:
# Stuff for figureing out exchange rates
df['implied_fx_rate'] = df['usd_pledged'] / df['pledged']
df[['usd_pledged','implied_fx_rate','fx_rate','static_usd_rate']].head(10)
df['usd_goal'] = df['goal'] * df['static_usd_rate']
df['reached_goal'] = df['usd_goal'] < df['usd_pledged']
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful" & usd_type == "international"').match.value_counts()

True     2084
False      53
Name: match, dtype: int64