In [1]:
# import packages 
import pandas as pd
import numpy as np
import matplotlib
import glob
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
def load_kickstarter_data(datapath):
    '''datapath = location of csv files to be loaded'''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
    print('File import done')
    return df_ks

In [3]:
def load_kickstarter_data_short(datapath):
    ''' 
    This is a version of the main function to load jsut 2 files for use in testing
    datapath = location of csv files to be loaded
    '''
    # List with the names of all the csv files in the path
    csv_files = glob.glob(datapath+'/*.csv')

    print(f'Total files: {len(csv_files)}')

    # Loop through the files
    for file_idx, csv_file in enumerate(csv_files): 
        # create dataframe from 1st csv       
        if file_idx == 0:
            df_ks = pd.read_csv(csv_file)
            print(f'File number {file_idx + 1} added to dataframe')
        else:
            # create dataframe from idx csv
            df = pd.read_csv(csv_file)
            # check files are all in same
            if  np.all(df.columns == df_ks.columns) == False:
                print(f'Column format of {csv_file} does not match {csv_files[0]}. Please check and try again')
                return
            else:
                # append to initial dataframe                   
                df_ks = pd.concat([df_ks, df], axis=0, ignore_index=True)       
                print(f'File number {file_idx + 1} added to dataframe')
                # This is here to prevent more than 2 files being loaded to save time in testing
                break
    print('File import done')
    return df_ks

In [4]:
def extract_json_data(data):
    ''' This function extracts specific sub fields from json files embedded in columns of a dataframe
        data: dataframe containing column with json data'''
    data['category_name'] = pd.DataFrame.from_dict([json.loads(data["category"][i])['name'] for i in range(data.shape[0])])
    data['category_slug'] = pd.DataFrame([json.loads(data["category"][i])['slug'] for i in range(data.shape[0])])
    # TODO split slug into main category and sub category
    data.drop(labels = 'category', axis=1, inplace=True)
    
    print('json columns extracted')
    return data

In [5]:
def get_duration(data):
    #Convert from unix time stamp to more readable time format
    data['converted_deadline'] = pd.to_datetime(data['deadline'], unit='s')
    data['converted_launched_at'] = pd.to_datetime(data['launched_at'], unit='s')
    #Create project duration variable
    data['project_duration_days'] = (data['converted_deadline'] - data['converted_launched_at']).dt.days
    # Drop redundant columns
    data.drop(columns=['deadline', 'launched_at'], inplace=True)
    return data

In [6]:
def get_target(data,target='state', new_target_var='success', success_label='successful'):
    '''
    creates a dummy variable out of the state to be used as dependant variable
    '''
    #data('success') = data['state'].apply(lambda x: 1 if x == 'successful' else 0)
    data[new_target_var] = data[target].apply(lambda x: 1 if x == success_label else 0)
    return data

In [7]:
def get_target_and_features(data, target_var='success'):
    '''
    Function that splits dataset into target and feature dataframes
    '''
    #target = data['success']
    target = data[target_var]
    data.drop([target_var,'state'], axis = 1, inplace=True)
    features = data

    print('target and features split is done')
    return target, features

In [8]:
def currency_conversion(data):
    # Convert the currency of all projects to USD. 
    # We use static_usd_rate since this is what was used for usd_pledged
    data['usd_goal'] = data['goal'] * data['static_usd_rate']
    # drop goal and static_usd_rate to remove redundant data
    data.drop(columns=['goal','static_usd_rate'], inplace=True)
    return data


In [9]:
def feature_engineering(data):
    '''
    Create new features: Blurb length
    '''
    data['blurb_length'] = data.blurb.apply(lambda x: len(str(x)))
    return data

In [10]:
def scale_features(data, num_columns=['usd_goal','project_duration_days','blurb_length']):
    ''' Initialize a scaler, then apply it to the features'''
    
    scaler = MinMaxScaler()
    data[num_columns] = scaler.fit_transform(data[num_columns])
    
    return data

In [11]:
def drop_columns(data):
    '''remove unnecessary columns'''
    # Drop due to many missing values
    data.drop(columns = ['friends', 'is_backing', 'is_starred', 'permissions'], inplace=True)
    # Some Json strings varariables with unusable or already used data
    data.drop(columns = ['creator', 'location', 'photo', 'profile', 'slug', 'urls'], inplace=True)
    # Columns that are not specific to the campaign or are redundant or are technical data unrelated to campaign
    data.drop(columns = ['created_at','currency', 'currency_symbol', 'currency_trailing_code', 
                     'current_currency', 'disable_communication',
                     'is_starrable', 'source_url', 'spotlight', 'staff_pick', 
                     'usd_type', 'state_changed_at','fx_rate'], inplace=True)
    # drop columns due to being linked to dependent variable which would not be known in advance
    data.drop(columns = [ 'converted_pledged_amount', 'pledged', 'usd_pledged','id'], inplace=True) # to be checked 'backers_count'
    # drop columns that are not used                
    data.drop(columns = ['blurb', 'name', 'converted_deadline', 'converted_launched_at','category_name'], inplace=True) #'category_slug'
    return data

In [12]:
def test_train_split_kickstarter(features, target, test_size = 0.2, random_state = 0):
    '''
    Split data into train and test sets based on features and target dataframes.
    Shows results of the split and returns four dataframes
    '''
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = test_size, random_state = random_state)
    # Show the results of the split
    print ("Training set has {} samples.".format(X_train.shape[0]))
    print ("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [13]:
def train_using_gini(X_train, y_train): 
  
    # Creating the decision tree classifier object 
    clf_tree = DecisionTreeClassifier(criterion = "gini", 
            max_depth=3, min_samples_leaf=5) 
    # Performing training 
    clf_tree.fit(X_train, y_train) 
    return clf_tree

In [14]:
# Function to make predictions 
def prediction(X_test, clf_object): 
  
    # Predicton on test data with model trained using either giniIndex
    y_pred = clf_object.predict(X_test) 
    print("Predicted values:\n") 
    print(y_pred) 
    return y_pred 

In [15]:
def cal_accuracy(y_test, y_pred): 
      
    print("Confusion Matrix: \n", 
    confusion_matrix(y_test, y_pred)) 
      
    print ("Accuracy : \n", 
    accuracy_score(y_test, y_pred)*100) 
      
    print("Report : \n", 
    classification_report(y_test, y_pred)) 

In [17]:
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
    
    # Read data

    df = load_kickstarter_data('kickstarter/data')
    
    # Extract category data from json
    df = extract_json_data(df)
    
    # drop campaigns that are still ongoing
    df = df[df.state != 'live'] # change this to sucessful and 
    
    # convert unix timestamps and calculate campaign duration
    df = get_duration(df)
    
    # Get blurb length
    df = feature_engineering(df)
    
    #create goal data in single currency
    df = currency_conversion(df)
    
    # encode target variable 'state' to numerical values, success is 1 all others are fail and 0
    df = get_target(df,target='state', new_target_var='success', success_label='successful')

    # drop unnecessary columns
    df=drop_columns(df)
    df.head()

    # Split the data into features and target label
    target, features = get_target_and_features(df)

    # split categorical columns into dummies
    features = pd.get_dummies(features, columns=['country', 'category_slug'], drop_first=True) #Avoid dummy trap   

    # Clean and augment data
    # scale numerical features
    num_columns = ['usd_goal','project_duration_days']
    features = scale_features(features, num_columns)
    
    # Split into training and test set
    X_train, X_test, y_train, y_test = test_train_split_kickstarter(features, target, test_size = 0.2, random_state = 0)
    
    # Fit a simple decision tree first
    clf_gini = train_using_gini(X_train, y_train) 
    
    # Create predictions using simple model
    y_pred = prediction(X_test, clf_object=clf_gini)
    
    # show results
    cal_accuracy(y_test, y_pred)

Total files: 56
File number 1 added to dataframe
File number 2 added to dataframe
File number 3 added to dataframe
File number 4 added to dataframe
File number 5 added to dataframe
File number 6 added to dataframe
File number 7 added to dataframe
File number 8 added to dataframe
File number 9 added to dataframe
File number 10 added to dataframe
File number 11 added to dataframe
File number 12 added to dataframe
File number 13 added to dataframe
File number 14 added to dataframe
File number 15 added to dataframe
File number 16 added to dataframe
File number 17 added to dataframe
File number 18 added to dataframe
File number 19 added to dataframe
File number 20 added to dataframe
File number 21 added to dataframe
File number 22 added to dataframe
File number 23 added to dataframe
File number 24 added to dataframe
File number 25 added to dataframe
File number 26 added to dataframe
File number 27 added to dataframe
File number 28 added to dataframe
File number 29 added to dataframe
File nu

In [25]:
df['blurb_length'] = df.blurb.apply(lambda x: len((x)))

TypeError: object of type 'float' has no len()

In [17]:
features.head()

Unnamed: 0,backers_count,country,category_slug,project_duration_days,blurb_length,usd_goal
1,47,US,games/playing cards,0.325843,125,1.2e-05
2,271,US,music/rock,0.325843,108,0.000187
3,3,GB,games/playing cards,0.651685,133,0.000152
4,3,US,publishing/nonfiction,0.325843,121,3.5e-05
5,35,US,music/classical music,0.325843,134,4.4e-05


In [18]:
# Check shape of dataframes
print(df.columns)
print(df.shape)
print(X_test.shape)
print(y_test.shape)
print(X_train.shape)
print(y_train.shape)
print(y_train.dtype)
print(y_pred.shape)

Index(['backers_count', 'country', 'state', 'category_name', 'category_slug',
       'project_duration_days', 'blurb_length', 'usd_goal', 'success'],
      dtype='object')
(7318, 9)
(1464, 358)
(1464,)
(5854, 358)
(5854,)
int64
(1464,)


In [19]:
df.category_slug.head()

1      games/playing cards
2               music/rock
3      games/playing cards
4    publishing/nonfiction
5    music/classical music
Name: category_slug, dtype: object

In [7]:
df.state.unique()

NameError: name 'df' is not defined

In [None]:
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful"').match.value_counts()

In [None]:
# Stuff for figureing out exchange rates
df['implied_fx_rate'] = df['usd_pledged'] / df['pledged']
df[['usd_pledged','implied_fx_rate','fx_rate','static_usd_rate']].head(10)
df['usd_goal'] = df['goal'] * df['static_usd_rate']
df['reached_goal'] = df['usd_goal'] < df['usd_pledged']
df['success'] = df.state.apply(lambda x: True if x == 'successful' else False)
df['match'] = df['success'] == df['reached_goal']
df.query('state == "successful" & usd_type == "international"').match.value_counts()