# Introduction / Description

# Import Python packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import os # doesnt have to be installed with pip

# Default size applied to every matplotlib figure created in this notebook
plt.rcParams.update({"figure.figsize": (8, 5)})

# Let pandas render every column instead of eliding the middle ones
pd.set_option("display.max_columns", None)

In [16]:
# Read the preprocessed Kickstarter export (path is relative to the notebook)
kickstarter_csv = 'data/Kickstarter_preprocessed.csv'
df = pd.read_csv(kickstarter_csv)

In [7]:
# Print a summary of the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 75 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   backers_count                      209222 non-null  int64  
 1   description                        209214 non-null  object 
 2   converted_pledged_amount           209222 non-null  int64  
 3   country                            209222 non-null  object 
 4   created_at                         209222 non-null  int64  
 5   currency                           209222 non-null  object 
 6   currency_symbol                    209222 non-null  object 
 7   currency_trailing_code             209222 non-null  bool   
 8   current_currency                   209222 non-null  object 
 9   deadline                           209222 non-null  int64  
 10  disable_communication              209222 non-null  bool   
 11  friends                            300 

In [17]:
# Tidy the column layout in one pass:
#   1. discard the leftover index column written by the earlier CSV export
#   2. give the category / text columns the names used in the rest of the notebook
column_renames = {
    'name_category': 'category_sub',
    'slug_category': 'category',
    'blurb': 'description',
}
df = df.drop(columns=['Unnamed: 0']).rename(columns=column_renames)

#### Timestamps are in Unix time and have to be converted --> next cell

In [18]:
# Helper that turns a Unix timestamp into a datetime object.
# (A lambda with the same body would work equally well.)
from datetime import datetime, timezone

def chg_date(x):
    """Convert a Unix timestamp to a naive datetime expressed in UTC.

    Parameters
    ----------
    x : int or float
        Seconds elapsed since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    datetime.datetime
        The corresponding UTC wall-clock time, without tzinfo attached.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip the tzinfo so the result stays
    # naive, exactly as before.
    return datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None)

# apply on dataframe

time_cols = ['created_at','deadline','state_changed_at','launched_at']
# Convert the epoch-second columns column-wise with pd.to_datetime instead of
# DataFrame.applymap (deprecated since pandas 2.1, and one Python call per cell).
# unit='s' interprets the integers as seconds since the Unix epoch and yields
# naive UTC timestamps, i.e. the same values chg_date produces.
df[time_cols] = df[time_cols].apply(pd.to_datetime, unit='s')

## Basic Feature Engineering

- Parsing 'blurb' (i.e. the project description) for keywords might be useful, but is not feasible at the moment due to time constraints. Instead, use only the length of the description.
- In order to compare pledge goals we convert them to USD
- Create duration of project as possible indicator

In [19]:
######### First feature engineering

# Replace short description of project with its length in words.
# Missing descriptions are filled with '' first so they count as 0 words;
# str(NaN) would otherwise become the text 'nan' and be counted as 1 word.
# NOTE: this overwrites the text column, so re-running the cell on its own
# would count the "words" of the already-computed numbers.
df['description'] = (
    df['description']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

# convert all goals into USD so that they are comparable across currencies
df = df.assign(usd_goal=df['static_usd_rate'] * df['goal'])

# create duration of project (timedelta) and its whole-day component
df['duration'] = df['deadline'] - df['launched_at']
df['duration_days'] = df['duration'].dt.days

# create year and month in which the project was started
df['start_month'] = df['launched_at'].dt.month
df['start_year'] = df['launched_at'].dt.year

# split text in column category, keep only the part left of the first '/'
# --> main category (.str accessor leaves missing values as NaN instead of raising)
df['category'] = df['category'].str.split('/', n=1).str[0]

# change sub-category to lower case string
df['category_sub'] = df['category_sub'].str.lower()

In [20]:
# Columns selected as model inputs
features = [
    'description',
    'duration_days',
    'currency',
    'usd_goal',
    'country',
    'staff_pick',
    'category',
    'category_sub',
    'start_month',
]
# The 'state' column is kept separately as the target
target = df['state']
# Show the selected feature columns
df[features]

Unnamed: 0,description,duration_days,currency,usd_goal,country,staff_pick,category,category_sub,start_month
0,20,60,USD,5000.000000,US,False,games,live games,4
1,15,30,USD,6000.000000,US,False,film & video,drama,8
2,19,30,USD,10000.000000,US,False,film & video,experimental,7
3,25,30,USD,8000.000000,US,False,journalism,journalism,5
4,11,30,USD,2000.000000,US,False,games,mobile games,9
...,...,...,...,...,...,...,...,...,...
209217,22,31,GBP,1468.271390,GB,False,theater,festivals,6
209218,15,29,GBP,718.323248,GB,False,fashion,accessories,2
209219,13,30,USD,30000.000000,US,False,fashion,footwear,6
209220,10,33,USD,10000.000000,US,False,fashion,footwear,8
