In [28]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path

## Import data 

In [29]:
# Load the raw Indiegogo export, expose its `project_id` column under the
# name `ID`, and use that column as the row index.
indiegogo_csv_path = Path('./Resources/indiegogo_data/indiegogo.csv')
indiegogo_df = (
    pd.read_csv(indiegogo_csv_path)
    .rename(columns={'project_id': 'ID'})
    .set_index('ID')
)

# Preview the first two rows
indiegogo_df.head(2)

Unnamed: 0_level_0,currency,category,year_end,month_end,day_end,time_end,amount_raised,funded_percent,in_demand,year_launch,...,apr,may,jun,jul,aug,sep,oct,nov,dec,tperiod
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3936,USD,Transportation,2010,5,12,23:59:00,840,16.80%,False,2010,...,1,0,0,0,0,0,0,0,0,1
5109,USD,Human Rights,2010,7,2,23:59:00,250,20.83%,False,2010,...,0,0,1,0,0,0,0,0,0,2


## Rename cols

In [30]:
# Give the Indiegogo columns the names used by the kickstarter df
kickstarter_style_names = {
    'title': 'name',
    'amount_raised_usd': 'usd_pledged_real',
    'goal_usd': 'usd_goal_real',
}
indiegogo_df = indiegogo_df.rename(columns=kickstarter_style_names)

## Clean df

In [31]:
# Combine the separate date and time columns into single datetime columns
launch_stamp_text = indiegogo_df['date_launch'] + ' ' + indiegogo_df['time_launch']
end_stamp_text = indiegogo_df['date_end'] + ' ' + indiegogo_df['time_end']
indiegogo_df['launched'] = pd.to_datetime(launch_stamp_text)
indiegogo_df['deadline'] = pd.to_datetime(end_stamp_text)

# Remove every row whose `category` is one of the excluded values
categories_to_delete = [
    'Local Businesses',
    'Environment',
    'Home',
    'Education',
    'Travel & Outdoors',
]
is_excluded_category = indiegogo_df['category'].isin(categories_to_delete)
indiegogo_df = indiegogo_df[~is_excluded_category]

indiegogo_df

Unnamed: 0_level_0,currency,category,year_end,month_end,day_end,time_end,amount_raised,funded_percent,in_demand,year_launch,...,jun,jul,aug,sep,oct,nov,dec,tperiod,launched,deadline
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3936,USD,Transportation,2010,5,12,23:59:00,840,16.80%,False,2010,...,0,0,0,0,0,0,0,1,2010-04-21 22:38:42,2010-05-12 23:59:00
5109,USD,Human Rights,2010,7,2,23:59:00,250,20.83%,False,2010,...,1,0,0,0,0,0,0,2,2010-06-10 17:47:35,2010-07-02 23:59:00
5307,USD,Human Rights,2010,7,10,23:59:00,200,16.67%,False,2010,...,1,0,0,0,0,0,0,3,2010-06-18 09:49:01,2010-07-10 23:59:00
8731,USD,Photography,2010,10,9,23:59:00,500,25.00%,False,2010,...,0,0,0,1,0,0,0,4,2010-09-09 13:50:48,2010-10-09 23:59:00
9165,USD,Human Rights,2011,1,12,23:59:00,360,0.65%,False,2010,...,0,0,0,1,0,0,0,5,2010-09-14 18:38:51,2011-01-12 23:59:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2584905,USD,Film,2020,8,30,23:59:59,1351,14.47%,False,2020,...,0,1,0,0,0,0,0,22115,2020-07-16 00:00:00,2020-08-30 23:59:59
2599602,CHF,Music,2020,8,16,23:59:59,76,2.17%,False,2020,...,0,1,0,0,0,0,0,22116,2020-07-17 00:00:00,2020-08-16 23:59:59
2595782,EUR,Fashion & Wearables,2020,9,15,23:59:59,73,2.43%,False,2020,...,0,1,0,0,0,0,0,22117,2020-07-17 00:00:00,2020-09-15 23:59:59
2617141,USD,Comics,2020,8,1,23:59:59,100,5.00%,False,2020,...,0,1,0,0,0,0,0,22118,2020-07-17 00:00:00,2020-08-01 23:59:59


## Create dfs from selected columns

In [32]:
# Columns removed from the working frame, grouped by what their names refer to
indiegogo_cols_drop = [
    # split date parts, raw amount and miscellaneous fields
    'year_end', 'month_end', 'day_end', 'amount_raised', 'in_demand',
    'year_launch', 'month_launch', 'day_launch', 'tagline', 'url',
    # columns named after countries / regions
    'australia', 'canada', 'switzerland', 'denmark', 'western_europe',
    'great_britain', 'hong_kong', 'norway', 'sweden', 'singapore',
    'united_states',
    # columns named after categories
    'education', 'productivity', 'energy_greentech', 'wellness', 'comics',
    'fashion_wearables', 'video_games', 'photography', 'tv_shows',
    'dance_theater', 'phones_accessories', 'audio', 'film', 'transportation',
    'art', 'environment', 'writing_publishing', 'music', 'travel_outdoors',
    'health_fitness', 'tabletop_games', 'home', 'local_business',
    'food_beverage', 'culture', 'human_rights', 'podcasts_vlogs',
    'camera_gear',
    # date/time text columns already combined into `launched` / `deadline`
    'time_end', 'time_launch', 'date_launch', 'date_end',
    # month columns
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'tperiod',
]

# Working frame: everything in indiegogo_df except the columns listed above
indiegogo_selected_cols_df = indiegogo_df.drop(columns=indiegogo_cols_drop)

## Add new cols

In [33]:
# Duration
indiegogo_selected_cols_df['duration'] = indiegogo_selected_cols_df['deadline'] - indiegogo_selected_cols_df['launched'] 
indiegogo_selected_cols_df['duration'] = indiegogo_selected_cols_df['duration'].dt.days

# Daily Goal 
indiegogo_selected_cols_df['daily_goal'] = round(indiegogo_selected_cols_df['usd_goal_real'] / indiegogo_selected_cols_df['duration'],2)

# Daily Pledged
indiegogo_selected_cols_df['daily_pledged'] = round(indiegogo_selected_cols_df['usd_pledged_real'] / indiegogo_selected_cols_df['duration'],2)

# Funded Percentage
indiegogo_selected_cols_df['funded_percent'] = round(indiegogo_selected_cols_df['usd_pledged_real'] / indiegogo_selected_cols_df['usd_goal_real'],4)

# Add country col using currency/country matchings from kickstarter large dataset
# Import kickstarter data
# Import large kickstarter recent dataset
kickstarter_currency_country_cols_df = pd.read_csv(
    Path('./Resources/kickstarter_data/ks-projects-201801.csv'),
    usecols=['currency', 'country']
)

# Imported data has a space at the end of the column name
# Remove spaces in columns name
kickstarter_currency_country_cols_df.columns = kickstarter_currency_country_cols_df.columns.str.replace(' ','')


# Create a dictionary of country and currency pairs to add country col
country_currency_df = kickstarter_currency_country_cols_df.loc[:,['country', 'currency']]
country_currency_df.drop(kickstarter_currency_country_cols_df[kickstarter_currency_country_cols_df['country'] == 'N,0"'].index, inplace=True)
country_currency_df.drop_duplicates(inplace=True)
country_currency_df.set_index('currency', inplace = True)
currency_country_dict = country_currency_df.to_dict()['country']

# Clean up all the N,0" values for countries using the currency_country_dict
indiegogo_selected_cols_df['country'] = indiegogo_selected_cols_df.apply(lambda row: currency_country_dict[row['currency']], axis=1)

# Add full country name col
# Add full country name col
%run ./consts.ipynb
indiegogo_selected_cols_df['full_country_names'] = indiegogo_selected_cols_df['country'].apply(lambda x: full_country_names_dict[x])

# Add lat/long depends on running ./Util/conts.ipynb first
indiegogo_selected_cols_df['lat'] = indiegogo_selected_cols_df['country'].apply(lambda x: country_lat_long_dict[x][0])
indiegogo_selected_cols_df['long'] = indiegogo_selected_cols_df['country'].apply(lambda x: country_lat_long_dict[x][1])

# Add main category depends on running ./Util/conts.ipynb first
indiegogo_selected_cols_df['main_category'] = indiegogo_selected_cols_df['category'].apply(lambda x: kickstarter_cat_dict[x])


display(indiegogo_selected_cols_df.head(2))
# Fix types
indiegogo_selected_cols_df = indiegogo_selected_cols_df.astype({"funded_percent": 'float'})

Unnamed: 0_level_0,currency,category,funded_percent,name,state,usd_pledged_real,usd_goal_real,launched,deadline,duration,daily_goal,daily_pledged,country,full_country_names,lat,long,main_category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3936,USD,Transportation,0.168,Join the Electric Revolution!!!,0,840.0,5000.0,2010-04-21 22:38:42,2010-05-12 23:59:00,21,238.1,40.0,US,United States,37.09024,-95.712891,Technology
5109,USD,Human Rights,0.2083,Relief Trip to Haiti,0,250.0,1200.0,2010-06-10 17:47:35,2010-07-02 23:59:00,22,54.55,11.36,US,United States,37.09024,-95.712891,Journalism


## Reorder cols

In [34]:
# Target column order for the cleaned frame
base_order = [
    'name', 'main_category', 'category', 'currency',
    'usd_goal_real', 'usd_pledged_real',
    'deadline', 'launched', 'state',
    'funded_percent', 'duration', 'daily_goal', 'daily_pledged',
    'country', 'full_country_names', 'lat', 'long',
]
indiegogo_order = base_order

# Keep exactly these columns, in this order
indiegogo_selected_cols_df = indiegogo_selected_cols_df.loc[:, indiegogo_order]

## Write clean data to files

In [17]:
# Write the cleaned frame to CSV. The output folder is created first so the
# export also works when ./Resources/indiegogo_data_clean does not exist yet
# (e.g. on a fresh checkout).
indiegogo_clean_csv_path = Path('./Resources/indiegogo_data_clean/indiegogo-projects.csv')
indiegogo_clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
indiegogo_selected_cols_df.to_csv(indiegogo_clean_csv_path)