In [1]:
# import packages 
import pandas as pd
import numpy as np
import matplotlib
import glob
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [2]:
def load_kickstarter_data(datapath):
    '''datapath = location of csv files to be loaded'''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
    print('File import done')
    return df_ks

In [3]:
def load_kickstarter_data_short(datapath):
    ''' 
    This is a version of the main function to load jsut 2 files for use in testing
    datapath = location of csv files to be loaded
    '''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
                # This is here to prevent more than 2 files being loaded to save time in testing
                break
    print('File import done')
    return df_ks

In [4]:
def extract_json_data(data):
    ''' This function extracts specific sub fields from json files embedded in columns of a dataframe
        data: dataframe containing column with json data'''
    data['category_slug'] = pd.DataFrame([json.loads(data["category"][i])['slug'] for i in range(data.shape[0])])
    data['category_name'] = pd.DataFrame.from_dict([json.loads(data["category"][i])['name'] for i in range(data.shape[0])])
    data = data.drop('category', axis=1, inplace=True)
    
# other Jsons are not parsable by this method
#    data['location_name'] = pd.DataFrame.from_dict([json.loads(data["location"][i])['name'] for i in range(data.shape[0])])
#    data['location_country'] = pd.DataFrame([json.loads(data["location"][i])['country'] for i in range(data.shape[0])])
#    data['creater_is_registered'] = pd.DataFrame([json.loads(data["creator"][i])['is_registered'] for i in range(data.shape[0])])
    print('json columns extracted')
    return data
# check output
#df2 = extract_json_data(df)
#df2.head()

In [5]:
def get_target_features(data):
    '''
    Function that splits dataset into target and feature dataframes
    '''
    target = data['state']
    data.drop('state', axis = 1, inplace=True)
    features_raw = data
    print('target and features split is done')
    return target, features_raw

In [30]:
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type', 'target'],
      dtype='object')

In [34]:
#def create_currency
df['implied_fx_rate'] = df['usd_pledged'] / df['pledged']


In [36]:
df[['usd_pledged','implied_fx_rate','fx_rate','static_usd_rate']].head(10)

Unnamed: 0,usd_pledged,implied_fx_rate,fx_rate,static_usd_rate
0,28645.0,1.0,1.0,1.0
1,1950.0,1.0,1.0,1.0
2,22404.0,1.0,1.0,1.0
3,165.384934,1.216066,1.308394,1.216066
4,2820.0,1.0,1.0,1.0
5,3725.0,1.0,1.0,1.0
6,3890.0,1.0,1.0,1.0
7,660.0,1.0,1.0,1.0
8,529.786729,1.412765,1.308394,1.412765
9,2516.160602,1.287697,1.308394,1.287697


In [50]:
df['usd_goal'] = df['goal'] * df['static_usd_rate']
df['reached_goal'] = df['usd_goal'] < df['usd_pledged']
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful" & usd_type == "international"').match.value_counts()

True     2084
False      53
Name: match, dtype: int64

In [42]:
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful"').match.value_counts()

True     4202
False     104
Name: match, dtype: int64

In [6]:
def scale_features(data, num_columns):
    ''' Initialize a scaler, then apply it to the features'''
    scaler = MinMaxScaler()
    data[num_columns] = scaler.fit_transform(data[num_columns])
    return data

In [7]:
def test_train_split_kickstarter(features, target, test_size = 0.2, random_state = 0):
    '''
    Split data into train and test sets based on features and target dataframes.
    Shows results of the split and returns four dataframes
    '''
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = test_size, random_state = random_state)
    # Show the results of the split
    print ("Training set has {} samples.".format(X_train.shape[0]))
    print ("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [8]:
'''# this doesn't work

#data['category_slug'] = pd.DataFrame([json.loads(data["category"][i])["slug"] for i in range(data.shape[0])])
json_columns = ['category'] #, 'creator', 'urls', 'location']
for col in json_columns:
    for i in range(len(df[col])):
        try:
            dict_cat = json.loads(df[col][i])
        except:
            dict_cat = 0
    df_json = pd.DataFrame.from_dict(dict_cat) 
    df = pd.merge(left=df, right=df_json, left_index=True, right_index=True, suffixes = ('',col) )      
df_json.head()
#    

#df0.head()'''

'# this doesn\'t work\n\n#data[\'category_slug\'] = pd.DataFrame([json.loads(data["category"][i])["slug"] for i in range(data.shape[0])])\njson_columns = [\'category\'] #, \'creator\', \'urls\', \'location\']\nfor col in json_columns:\n    for i in range(len(df[col])):\n        try:\n            dict_cat = json.loads(df[col][i])\n        except:\n            dict_cat = 0\n    df_json = pd.DataFrame.from_dict(dict_cat) \n    df = pd.merge(left=df, right=df_json, left_index=True, right_index=True, suffixes = (\'\',col) )      \ndf_json.head()\n#    \n\n#df0.head()'

In [9]:
def train_using_gini(X_train, X_test, y_train): 
  
    # Creating the classifier object 
    clf_gini = DecisionTreeClassifier(criterion = "gini", 
            max_depth=3, min_samples_leaf=5) 
    # Performing training 
    clf_gini.fit(X_train, y_train) 
    return clf_gini 

In [10]:
# Function to make predictions 
def prediction(X_test, clf_object): 
  
    # Predicton on test data with model trained using either giniIndex / or entropy
    y_pred = clf_object.predict(X_test) 
    print("Predicted values:\n") 
    print(y_pred) 
    return y_pred 

In [11]:
def cal_accuracy(y_test, y_pred): 
      
    print("Confusion Matrix: \n", 
        confusion_matrix(y_test, y_pred)) 
      
    print ("Accuracy : \n", 
    accuracy_score(y_test,y_pred)*100) 
      
    print("Report : \n", 
    classification_report(y_test, y_pred)) 

In [20]:
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
    
    # Read data
    df = load_kickstarter_data_short('kickstarter/data')
    # drop campaigns that are still ongoing
    features_raw = extract_json_data(df)
    df = df[df.state != 'live']

    # Split the data into features and target label
    target, features_raw = get_target_features(df)


    # encode target variable 'state' to numerical values, success is 1 all others are fail and 0
    df['target'] = df.state.apply(lambda x: 1 if x == 'successfull' else 0)

    # split categorical columns into dummies
#    features_raw = pd.get_dummies(features_raw)    

    # Clean and augment data
    # scale numerical features
    num_columns = ['goal']
    features_raw = scale_features(df, num_columns)
    
    # Split into training and test set
    X_train, X_test, y_train, y_test = test_train_split_kickstarter(features_raw, target, test_size = 0.2, random_state = 0)
    
    # Fit a simple decision tree first
    clf_gini = train_using_gini(X_train, X_test, y_train) 
    
    # Create predictions using simple model
    y_pred = prediction(X_test, clf_gini)
    
    # show results
    cal_accuracy(y_test, y_pred)
    
    # Creating the classifier object 


    # Fit Adaboost classifier using a decision tree as base estimator
    # Test with different number of iterations
    # Append the score output from each iteration of the adboost function
    
    # Compare error rate vs number of iterations
    #   plot_error_rate(er_train, er_test)

Total files: 56
File number 1 added to dataframe
File number 2 added to dataframe
File import done
json columns extracted
target and features split is done
Training set has 5854 samples.
Testing set has 1464 samples.


ValueError: could not convert string to float: 'The Maderati: A bitingly witty absurdest comedy, which pokes wickedly perceptive fun at NY artist lifestyle.'

In [26]:
df = load_kickstarter_data_short('kickstarter/data')
    # drop campaigns that are still ongoing

Total files: 56
File number 1 added to dataframe
File number 2 added to dataframe
File import done


In [27]:
df['target'] = df.state.apply(lambda x: 1 if x == 'successful' else 0)

In [28]:
df.state.unique()

array(['live', 'successful', 'failed', 'canceled', 'suspended'],
      dtype=object)

In [29]:
df.target.unique()

array([0, 1])

In [None]:
    # encode target variable 'state' to numerical values, success is 1 all others are fail and 0
# target = target.apply(lambda x: 1 if x == 'success' else 0)

In [None]:
    # split categorical columns into dummies
#    features_raw = pd.get_dummies(features_raw)    

    # Clean and augment data

In [None]:
num_columns = ['goal']
features_raw = scale_features(df, num_columns)

In [None]:
# Split into training and test set
X_train, X_test, y_train, y_test = test_train_split_kickstarter(features_raw, target, test_size = 0.2, random_state = 0)

In [None]:
X_train.shape

In [None]:
# Fit a simple decision tree first
clf_gini = train_using_gini(X_train, X_test, y_train) 

In [None]:
# Create predictions using simple model
y_pred = prediction(X_test, clf_gini)

# show results
cal_accuracy(y_test, y_pred)

# Creating the classifier object 


# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
# Append the score output from each iteration of the adboost function

# Compare error rate vs number of iterations
#   plot_error_rate(er_train, er_test)