# Kickstarter Project Forecast Model

## 1. Initial data overview

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw Kickstarter export and preview its first rows.
df = pd.read_csv(
    'data/kickstarter.csv',
)
df.head(n=5)

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df.info()

## 2. Data cleaning

In [None]:
# Drop the identifier, the project name and the goal/pledged columns;
# the analysis below relies on the *_real variants instead.
df.drop(
    columns=['ID', 'goal', 'pledged', 'usd pledged', 'name'],
    inplace=True,
)
df.head(n=5)

In [None]:
# Remove every row that still has at least one missing value, then re-check
# the non-null counts.
df.dropna(axis=0, how='any', inplace=True)
df.info()

## 3. EDA

### 3.1 Tỉ lệ các loại state

In [None]:
variable = 'state'

# Frequency of each project state, most frequent first.
pie_chart = df[variable].value_counts()
# Independent second count so that the table does not share data with pie_chart.
output = df[variable].value_counts().to_frame()

In [None]:
# Keep the three most frequent states and fold everything from the fourth
# position onward into a single slot.
# .copy() gives temp_pie its own data, so the positional assignment below
# cannot write through to pie_chart (possible on pandas without copy-on-write).
temp_pie = pie_chart.iloc[:4].copy()
temp_pie.iloc[3] = pie_chart.iloc[3:].sum()
# Relabel the aggregated slot by position rather than by a hard-coded label,
# so the rename still applies if the ordering of states changes.
temp_pie = temp_pie.rename(index={temp_pie.index[3]: 'others'})
# Blank name so the pie chart does not print a y-axis label.
temp_pie.name = ''

In [None]:
# Percentage share of each state, formatted as text such as "52.22%".
# The share is computed from the count column only (the first column), so the
# cell can be re-run after 'Share' has already been added.
count_col = output.columns[0]
share_pct = (output[count_col] / output[count_col].sum() * 100).round(2)
output['Share'] = share_pct.map("{:,.2f}%".format)
output.index.name = 'state'

In [None]:
# Pie chart of the condensed state distribution (top states plus "others").
# Plot pie_chart instead of temp_pie to see every state as its own slice.
temp_pie.plot.pie(
    startangle=90,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)
plt.title('Tỉ lệ các loại trạng thái của dự án Kickstarter')
# Display the count/share table underneath the chart.
output

### 3.2 Tỉ lệ các loại main_category

In [None]:
variable = 'main_category'

# Number of projects per main category, most frequent first.
pie_chart = df[variable].value_counts()
# Independent second count so that the table does not share data with pie_chart.
output = df[variable].value_counts().to_frame()
output

In [None]:
# Keep the nine most frequent main categories and fold everything from the
# tenth position onward into a single slot.
# .copy() gives temp_pie its own data, so the positional assignment below
# cannot write through to pie_chart (possible on pandas without copy-on-write).
temp_pie = pie_chart.iloc[:10].copy()
temp_pie.iloc[9] = pie_chart.iloc[9:].sum()
# Name the aggregated slot 'others' by position rather than by a hard-coded
# category label, so the rename still applies if the ranking changes.
temp_pie = temp_pie.rename(index={temp_pie.index[9]: 'others'})
# Blank name so the pie chart does not print a y-axis label.
temp_pie.name = ''
temp_pie

In [None]:
# Pie chart of the condensed main-category distribution.
temp_pie.plot.pie(
    startangle=90,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)

### 3.3 Tỉ lệ thành công của mỗi main category

In [None]:
# Number of projects per (main_category, state), one column per state.
# fill_value=0 keeps a category that lacks one of the states from turning its
# total into NaN.
df2 = df.groupby(['main_category', 'state']).size().unstack(fill_value=0)
# The total deliberately counts only the three outcomes shown in the chart.
df2['total'] = df2['failed'] + df2['successful'] + df2['canceled']
columns = ['successful', 'failed', 'canceled', 'total']
# Largest categories first.
df2 = df2[columns].sort_values(by='total', ascending=False)
df2

In [None]:
# Grouped bar chart: successful / failed / canceled / total per main category.
df2.plot.bar(figsize=(10, 5))
plt.title('Số lượng dự án thành công, thất bại và bị hủy trong mỗi danh mục')
plt.ylabel('Số lượng')
plt.xlabel('Danh mục chính')
plt.show()

### 3.4 Tổng usd_pledged_real của mỗi main category

In [None]:
# Sum of usd_pledged_real per main category, largest first, rescaled to
# millions of USD for a readable axis.
df3 = (
    df.groupby('main_category')['usd_pledged_real']
    .sum()
    .sort_values(ascending=False)
    .div(1e6)
)
df3.plot.bar(figsize=(10, 5))
plt.title('Tổng số tiền đã ủng hộ (USD) trong mỗi danh mục')
plt.ylabel('Tổng số tiền đã ủng hộ (triệu USD)')
plt.xlabel('Danh mục chính')

### 3.5 Tác động của usd_pledged_real và usd_goal_real đến khả năng thành công

In [None]:
import seaborn as sns

# Keep only the three outcomes of interest: canceled, failed and successful.
df_3_states = df[df['state'].isin(['canceled','failed','successful'])]

# The full value range is too wide to read, so restrict both axes to
# amounts below 2 million USD.
in_range = (df_3_states['usd_goal_real'] < 2*1e6) & (df_3_states['usd_pledged_real'] < 2*1e6)
df_3_states_small = df_3_states[in_range]

sns.set(style="darkgrid")
# lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind.
# height=10 with aspect=2 yields the intended 20x10 plotting area.
sns.lmplot(
    x="usd_pledged_real",
    y="usd_goal_real",
    hue='state',
    data=df_3_states_small,
    markers=["x", "o", "*"],
    palette="Set2",
    height=10,
    aspect=2,
)
plt.title("         Pledged and Goal amount in USD impact on Project Status")


### 3.6 Tỉ lệ tham gia của các quốc gia

In [None]:
variable = 'country'

# Share of projects per country among canceled/failed/successful projects.
pie_chart = df_3_states[variable].value_counts()
pie_chart.plot.pie(
    startangle=90,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)

In [None]:
# Leave out the US to look at the remaining countries on their own.
# errors='ignore' lets the cell be re-run after 'US' has already been removed.
pie_chart = pie_chart.drop('US', errors='ignore')
# Keep the nine most frequent remaining countries and fold everything from the
# tenth position onward into a single slot. .copy() gives temp_pie its own
# data, so the assignment below cannot write through to pie_chart.
temp_pie = pie_chart.iloc[:10].copy()
temp_pie.iloc[9] = pie_chart.iloc[9:].sum()
# Name the aggregated slot 'others' by position rather than by a hard-coded
# country code, so the rename still applies if the ranking changes.
temp_pie = temp_pie.rename(index={temp_pie.index[9]: 'others'})
# Blank name so the pie chart does not print a y-axis label.
temp_pie.name = ''
temp_pie.plot(kind = "pie", startangle=90, autopct='%1.0f%%', pctdistance=1.1, labeldistance=1.2)


### 3.7 Mối quan hệ backers và usd_pledged_real

In [None]:
# The full value range is too wide to read, so restrict the data to projects
# with fewer than 20000 backers and less than 2 million USD pledged.
in_range = (df_3_states['backers'] < 20000) & (df_3_states['usd_pledged_real'] < 2*1e6)
df_3_states_small = df_3_states[in_range]

sns.set(style="darkgrid")
# lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind.
# height=10 with aspect=2 yields the intended 20x10 plotting area.
sns.lmplot(
    x="usd_pledged_real",
    y="backers",
    hue='state',
    data=df_3_states_small,
    markers=["x", "o", "*"],
    palette="Set2",
    height=10,
    aspect=2,
)
plt.title("         Pledged amount in USD and Backers impact on Project Status")

### 3.8 usd_goal_real và usd_pledged_real trung bình của các quốc gia

In [None]:
# Ten countries with the highest mean usd_goal_real, as a bar chart.
plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")
(
    df.groupby('country')['usd_goal_real']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar')
)
plt.title("Country by Mean of Goal real amount")

In [None]:
# Ten countries with the highest mean usd_pledged_real, as a green bar chart.
plt.figure(figsize=(10,5))
# grid is documented as a bool; the string 'yes' only worked because it is truthy.
df.groupby(['country'])['usd_pledged_real'].mean().sort_values(ascending=False).head(10).plot(kind='bar', color='g', grid=True)
plt.title("Country by Mean of Pledged amount")

## 4. Data Pre-processing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Parse both timestamps once and reuse the parsed columns below.
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])

# launched_month: calendar month in which the project was launched.
df['launched_month'] = df['launched'].dt.month

# duration: whole days between launch and deadline.
df['duration'] = (df['deadline'] - df['launched']).dt.days

# The raw timestamps are no longer needed once the two features exist.
df.drop(columns=['launched', 'deadline'], inplace=True)

# Treat canceled projects as failed.
df['state'] = df['state'].replace('canceled', 'failed')

# Keep only the two target classes. .copy() makes df an independent frame so
# that the column assignments below do not write into a slice of the old one
# (which triggers SettingWithCopyWarning).
df = df[df['state'].isin(['successful', 'failed'])].copy()

# Integer-encode the categorical columns. A separate encoder is fitted and kept
# per column so that every mapping can be inverted later; refitting a single
# encoder would retain only the mapping of the last column.
# LabelEncoder assigns codes in sorted label order, so for 'state'
# 'failed' -> 0 and 'successful' -> 1.
cate_vars = ['main_category', 'country', 'category', 'state', 'currency']
label_encoders = {}
for var in cate_vars:
    lab_enc = LabelEncoder()
    df[var] = lab_enc.fit_transform(df[var])
    label_encoders[var] = lab_enc


In [None]:
# Persist the pre-processed frame; the row index is not written.
df.to_csv(
    path_or_buf='data/kickstarter_cleaned.csv',
    index=False,
)

In [None]:
# Preview the first rows of the pre-processed frame.
df.head()

In [None]:
# Check column dtypes and non-null counts after pre-processing.
df.info()

## 5. Machine Learning

In [None]:
# Reload the pre-processed data from disk and preview its first rows.
df = pd.read_csv(
    'data/kickstarter_cleaned.csv',
)
df.head(n=5)

### 5.1 Split Train/Test

In [None]:
from sklearn.model_selection import train_test_split

# Target is the encoded 'state' column; every other column is a feature.
y = df['state']
X = df.drop(columns=['state'])
# NOTE(review): X appears to still include backers and usd_pledged_real, which
# are only known once a campaign has run - confirm this is intended for a
# forecasting model.

# Hold out 20% of the rows as the final test set (fixed seed for repeatability).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### 5.2 K-Fold for each model

In [None]:
# models in use
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# metrics
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc, roc_curve
# cross validation
from sklearn.model_selection import KFold, cross_val_score

Gaussian Naive Bayes

In [None]:
# 5-fold cross-validation of GaussianNB over a grid of var_smoothing values,
# recording the mean F1 score of each candidate.
var_smoothings = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
kf = KFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []
for smoothing in var_smoothings:
    gnb = GaussianNB(var_smoothing=smoothing)
    fold_scores = cross_val_score(gnb, X_train_val, y_train_val, cv=kf, scoring='f1')
    f1 = fold_scores.mean()
    f1_scores.append(f1)
    print(f'var_smoothing = {smoothing}, f1 = {f1}')

In [None]:
# Pick the var_smoothing whose mean F1 was highest (first one wins on ties).
best_idx = np.argmax(f1_scores)
best_var_smoothing = var_smoothings[best_idx]
print('Best var_smoothing:', best_var_smoothing)