In [None]:
# import libraries
import ast
import glob
import json
import os
from datetime import datetime

import category_encoders as ce
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [None]:
# Use glob and a list comprehension to build one frame from many CSV files.
# This works because each file has the same columns.
# The CSV files can be found in 'data' in this GitHub repository,
# which may need to be copied locally.
# The paths are sorted because glob returns them in arbitrary order, and the
# resulting row order decides which duplicate id is kept and where the cleaned
# output is split.
csv_paths = sorted(glob.glob('data/Kickstarter*.csv'))
if not csv_paths:
    raise FileNotFoundError(
        "No files match 'data/Kickstarter*.csv'; copy the data folder locally first."
    )
df1 = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)


In [None]:
# Show up to 50 columns so the preview of the wide frame is not truncated.
pd.set_option('display.max_columns', 50)
df1.head(2)

In [None]:
# List the column labels of the combined frame.
df1.columns

In [None]:
# Free up the names 'category' and 'country' for the readable values:
# the raw category string and the country code get explicit names, the
# displayable country name becomes 'country', and 'state' becomes the target.
column_renames = {
    'category': 'category_string',
    'country': 'country_id',
    'country_displayable_name': 'country',
    'state': 'campaign_success',
}
df1.rename(columns=column_renames, inplace=True)

In [None]:
# Summarise dtypes and non-null counts of the renamed frame.
df1.info()

In [None]:
# Work on a copy so df1 stays as loaded and renamed.
df = df1.copy()

In [None]:
# Missing blurbs become empty strings before the cast: casting NaN straight to
# str yields the text 'nan' (length 3), which would be counted as a real blurb.
df['blurb'] = df['blurb'].fillna('').astype(str)
df['blurb_length'] = df['blurb'].str.len()
df['blurb_length'].describe()


In [None]:
# Number of rows at each blurb length, ordered by length.
df['blurb_length'].value_counts().sort_index()

## Prepare Category & Subcategory Data for Back End

In [None]:
# Parse each category string into a dict so it can be split into columns.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# cell in the CSV cannot execute code the way eval() would.
df['category_string'] = df['category_string'].map(ast.literal_eval)

In [None]:
# Split the parsed category dicts into columns (one column per dict key).
# Building the frame from a list of dicts yields the same columns as
# .apply(pd.Series) without constructing a Series for every row.
category_fields = pd.DataFrame(df['category_string'].tolist(), index=df.index)

# Drop the fields that are not needed and give the rest project names.
df_c = (
    category_fields
    .drop(['id', 'position', 'parent_id', 'color', 'urls'], axis=1)
    .rename(columns={'name': 'subcategory', 'slug': 'cat_subcat', 'parent_name': 'category'})
)

# Rows without a parent_name fall back to the slug. The result is assigned
# back: fillna(inplace=True) on a column selection operates on a temporary
# under pandas Copy-on-Write and would leave df_c unchanged.
df_c['category'] = df_c['category'].fillna(df_c['cat_subcat']).str.capitalize()

# Attach the three derived columns to the main frame (aligned on the index).
df['category'] = df_c['category']
df['subcategory'] = df_c['subcategory']
df['cat_subcat'] = df_c['cat_subcat']


In [None]:
# Summarise df's columns, dtypes and non-null counts.
df.info()

In [None]:
# Reduce df_c to one row per distinct cat_subcat value, keep only the
# category and subcategory columns, and sort them for export.
columns_names = ['category', 'subcategory']
df_c = (
    df_c
    .drop_duplicates(subset=['cat_subcat'])
    .drop(columns=['cat_subcat'])
    .sort_values(by=['category', 'subcategory'])
    .reindex(columns=columns_names)
)
df_c.head(10)

In [None]:
# Build a sorted one-column table of the distinct country names.
df_country = (
    df[['country']]
    .sort_values(by=['country'])
    .drop_duplicates()
)
df_country

In [None]:
# Read two settings from the environment.
# NOTE(review): neither value is used below; the JSON files are written to
# fixed names in the working directory. Confirm whether these were meant to
# be the output targets.
categories_subcategories = os.getenv('Kickstart_Category_URL')
country = os.getenv('Kickstart_Country_URL')

# Export the category/subcategory table and the country table as JSON.
df_c.to_json('categories_subcategories.json')
df_country.to_json('country.json')


## Row Evaluation

In [None]:
# check for duplicates of individual projects
# duplicated() flags every occurrence after the first, so counting flagged rows
# would count an id that appears three times twice. Count the distinct ids
# among the flagged rows so the number matches the message.
dup_id = df.loc[df.duplicated(subset='id'), 'id'].nunique()
print(f"There are {dup_id} ids listed more than once.")

In [None]:
# Select every row whose id occurs more than once (all occurrences kept).
is_repeated_id = df.duplicated(subset='id', keep=False)
df_dup_rows = df[is_repeated_id]
df_dup_rows.shape

In [None]:
# discover how many rows contain duplicates
dup_rows = len(df_dup_rows)
print(f"There are {dup_rows} rows containing duplicates.")

# Average copies per item = duplicated rows / distinct duplicated ids.
# The distinct-id count is taken from df_dup_rows itself so an id appearing
# three or more times is counted once, and the no-duplicates case is guarded
# instead of dividing by zero.
dup_items = df_dup_rows['id'].nunique()
if dup_items:
    print(f"That is an average of {(dup_rows/dup_items):.2f} copies per duplicated item.")
else:
    print("No ids are duplicated, so there is no average to report.")

In [None]:
# Check the values of the target column.
# Open questions: combine canceled with failed? Drop live?
# The column still needs to be made boolean.
print(df['campaign_success'].value_counts())
print("---"*15)

## Drop rows

In [None]:
# Shape before the row filters are applied.
print(df.shape)
def row_wrangle(X):
    """Return a filtered copy of X with a fresh 0..n-1 index.

    Rows are removed in this order:
      1. rows whose ``campaign_success`` is 'canceled' or 'live',
      2. repeated ``id`` values (the first occurrence is kept),
      3. rows whose ``blurb_length`` is 1 or less.

    X itself is not modified.
    """
    frame = X.copy()

    # Both state exclusions are applied with one combined mask.
    not_canceled = frame.campaign_success != 'canceled'
    not_live = frame.campaign_success != 'live'
    finished = frame[not_canceled & not_live]

    # De-duplicate before the blurb filter so the first occurrence of an id
    # is the one that is tested.
    unique_campaigns = finished.drop_duplicates(subset=['id'])
    has_blurb = unique_campaigns.blurb_length > 1

    return unique_campaigns[has_blurb].reset_index(drop=True)
# Apply the row filters and show the resulting shape.
df = row_wrangle(df)
print(df.shape)

In [None]:
# Re-inspect dtypes and non-null counts after the row filters.
df.info()

In [None]:
# Count rows per current_currency value.
# Decision: non-USD values are ignored.
df['current_currency'].value_counts()

In [None]:
# Number of rows at each blurb length, ordered by length.
df['blurb_length'].value_counts().sort_index()

In [None]:
# Summary statistics for df.
df.describe()

In [None]:
# Print the value counts of the two flag columns, each followed by a
# separator line and a blank line, then the number of distinct categories.
for flag_column in ('disable_communication', 'is_starrable'):
    print(df[flag_column].value_counts())
    print("---"*15)
    print('')
print(df['category'].nunique(), 'category items')

## Column Evaluation

In [None]:
# columns to retain as-is (6)
# 'subcategory' is spelled without an underscore to match the column name
# created when the category string is split.
keepers = ['country', 'goal', 'blurb', 'category', 'launched_at', 'subcategory']

# campaign_success, target column (1)

In [None]:
# Future-data columns (17), including 4 mostly-null columns.
# Author's notes carried over:
#   - 'spotlight' only applies to successfully funded projects
#   - 'is_starrable' is only true with live campaigns
future_data = [
    'backers_count',
    'converted_pledged_amount',
    'created_at',
    'friends',
    'id',
    'is_backing',
    'is_starred',
    'permissions',
    'photo',
    'pledged',
    'profile',
    'source_url',
    'spotlight',
    'staff_pick',
    'state_changed_at',
    'urls',
    'usd_pledged',
]

In [None]:
# Extraneous columns (11).
# Author's notes carried over:
#   - 'disable_communication' only contains the value 'false'
#   - 'is_starrable' only contains the value 'false'
#   - 'country_id' repeats what the displayable 'country' column already holds
extra_columns = [
    'category_string',
    'country_id',
    'creator',
    'disable_communication',
    'deadline',
    'is_starrable',
    'location',
    'name',
    'slug',
    'cat_subcat',
    'blurb_length',
]


In [None]:
# Currency columns (7).
# 'fx_rate' is the foreign exchange rate.
currency_columns = [
    'currency',
    'currency_symbol',
    'currency_trailing_code',
    'current_currency',
    'fx_rate',
    'static_usd_rate',
    'usd_type',
]

In [None]:
# Number of rows remaining in df.
len(df)

## Wrangle Data

In [None]:
def wrangle(X):
    """Return a column-cleaned copy of X; X itself is left untouched.

    Steps:
      * recode ``campaign_success`` so 'failed' becomes 0 and 'successful'
        becomes 1 (other values are left as they are),
      * add ``campaign_length``: ``deadline`` minus ``launched_at`` divided by
        the number of seconds in a day (the source comments state both are
        UNIX timestamps in seconds, so the result is in days),
      * drop the columns named in the module-level lists ``future_data``,
        ``extra_columns`` and ``currency_columns``.
    """
    seconds_per_day = 60 * 24 * 60
    outcome_codes = {'failed': 0, 'successful': 1}

    cleaned = X.copy()
    cleaned['campaign_success'] = cleaned['campaign_success'].replace(outcome_codes)

    # Computed before the drops below because 'deadline' is among the
    # extraneous columns that get removed.
    elapsed_seconds = cleaned['deadline'] - cleaned['launched_at']
    cleaned['campaign_length'] = elapsed_seconds / seconds_per_day

    # Future, extraneous and currency columns are removed in turn.
    return (
        cleaned
        .drop(columns=future_data)
        .drop(columns=extra_columns)
        .drop(columns=currency_columns)
    )

# Apply the column wrangling to the row-filtered frame.
kickstart = wrangle(df)


In [None]:
# Check the remaining columns, dtypes and non-null counts.
kickstart.info()

In [None]:
# Split the cleaned frame by position: the first 100000 rows and the rest.
split_row = 100000
kickstarter_1 = kickstart.iloc[:split_row, :]
kickstarter_2 = kickstart.iloc[split_row:, :]
print(f"Shape of new dataframes - {kickstarter_1.shape} , {kickstarter_2.shape}")

In [None]:
# Read two settings from the environment.
# NOTE(review): neither value is used below; the CSV files are written to
# fixed names in the working directory. Confirm whether these were meant to
# be the output targets.
Clean_Data_1=os.getenv('Clean_Data_URL_1')
Clean_Data_2=os.getenv('Clean_Data_URL_2')

# Write the two halves of the cleaned data without the index column.
kickstarter_1.to_csv('Clean_Data_1.csv', index = False)
kickstarter_2.to_csv('Clean_Data_2.csv', index = False)
