In [None]:
# Import Library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

import plotly.tools as tls
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

import warnings
from collections import Counter

# About Data
- Crowdfunding adalah sebuah bentuk pendanaan untuk mereka yang membutuhkan dana dalam pengembangan usahanya, dimana pendanaan tersebut terkumpul dari beberapa orang. Pendekatan ini memanfaatkan upaya kolektif dari sejumlah individu secara online menggunakan sebuah platform atau media sosial crowdfunding, sebagai alat menjangkaunya. Crowdfunding juga adalah salah satu bentuk dari crowdsourcing dan keuangan alternatif yang mulai kembali muncul dan banyak dimanfaatkan orang untuk membantu para pengusaha maupun UKM pemula.

# Data Pre-Processing

In [None]:
# Load the Kickstarter projects CSV into `df` and preview its first five rows
df = pd.read_csv(filepath_or_buffer="data_input/ksp_2018.csv")
df.head(5)

In [None]:
# Dimensions of the frame as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Print the column names, non-null counts and dtypes of the frame
df.info()

In [None]:
# Check missing values: count and percentage of NaN per column.
# The NaN mask is computed once and reused for the plot, the counts and the percentages.
# (matplotlib is already imported in the first cell, so no import is repeated here.)
missing_mask = df.isna()

# Number of missing entries per column, shown as a bar chart and as text
missing_values_sum = missing_mask.sum()
missing_values_sum.plot(kind="bar");
print(missing_values_sum)

# Share of missing entries per column, in percent of all rows
missing_values_percent = missing_mask.mean() * 100
print(missing_values_percent)

## Handling Missing data

**Handling Missing Values**

The researcher may leave the data as is or use data imputation to replace the missing values.  Suppose the number of cases of missing values is extremely small; then, an expert researcher may drop or omit those values from the analysis.  In statistical language, if the number of the cases is less than 5% of the sample, then the researcher can drop them.

In the case of multivariate analysis, if there is a larger number of missing values, then it can be better to drop those cases (rather than do imputation) and replace them.  On the other hand, in univariate analysis, imputation can decrease the amount of bias in the data, if the values are missing at random.
https://www.statisticssolutions.com/missing-values-in-data/

 **`In this case we have 378,661 rows, and the column with the most missing data (the town column) is missing 3,797 values, or about 1% of the total data. This does not really matter, considering the amount of data we have compared to the missing data. Next we will drop all rows that have missing values (NA).`**

In [None]:
# Drop every row that contains at least one missing value.
# dropna() returns a new frame, so the result must be assigned back;
# otherwise the rows with NaN stay in `df` for the rest of the notebook.
df = df.dropna()

# Show the remaining (rows, columns) instead of dumping the whole frame
df.shape

## Measure of Variance
- Range
- Variance
- Standard Deviation

In [None]:
# Range (maximum minus minimum) of the two monetary columns
numcols = df[['goal', 'pledged']]
for col in numcols.columns:
    column_values = df[col]
    spread = column_values.max() - column_values.min()
    print(column_values.name + ' range: ' + str(spread))

In [None]:
# Sample variance of goal, then of pledged
for col in ('goal', 'pledged'):
    print(df[col].var())

In [None]:
# Sample standard deviation of goal, then of pledged
for col in ('goal', 'pledged'):
    print(df[col].std())

# Exploratory Data Analysis

In [None]:
# Feature selection: keep USD/EUR projects that either failed or succeeded,
# then remove the columns that are not used in the rest of the analysis.
# Note: this cell rebinds `df`, so it cannot be re-run without reloading the data.
is_usd_or_eur = df['currency'].isin(["USD", "EUR"])
is_finished = df["state"].isin(["failed", "successful"])
df = df[is_usd_or_eur & is_finished]

unused_columns = ["deadline","backers", "launched", "ID", "country", "pledged", "usd pledged", "usd_pledged_real", "usd_goal_real"]
df = df.drop(columns=unused_columns)

In [None]:
# Convert data types in one pass: goal becomes an integer,
# the text columns become pandas categoricals.
df = df.astype({
    'goal': 'int64',
    'name': 'category',
    'category': 'category',
    'main_category': 'category',
    'currency': 'category',
    'state': 'category',
})

In [None]:
# Preview the first rows of the frame after feature selection and dtype conversion
df.head()

## Measure Of Central Tendency (Ukuran Pemusatan Data)

In [None]:
# Summary statistics via DataFrame.describe(); the min, quartile and max rows
# give the five-number summary
df.describe()

In [None]:
# Location statistics of the goal column, printed one per line
goal_values = df['goal']
goal_summary = [
    ('Min', goal_values.min()),
    ('Mode', goal_values.mode()[0]),   # first mode if there are ties
    ('Median', goal_values.median()),
    ('Mean', goal_values.mean()),
    ('Max', goal_values.max()),
]
for label, value in goal_summary:
    print(label + ': ' + str(value))

In [None]:
# Standard deviation of goal, displayed rounded to three decimals
std = df['goal'].std()
np.round(std, 3)

In [None]:
# Number of projects per main category, most frequent first (value_counts sorts by default)
x = df['main_category'].value_counts()
x

Kebanyakan dari orang atau kelompok yang melakukan pinjaman lebih kepada kategori film & video.

### Pie Plot & Histogram Distribution

In [None]:
# Share of each outcome in percent, rounded to two decimals
state_counts = df["state"].value_counts()
state = (state_counts / len(df)).mul(100).round(2)

labels = state.index.tolist()
values = list(state.to_numpy())

# Single pie trace with a horizontal legend
trace1 = go.Pie(
    labels=labels,
    values=values,
    marker=dict(colors=['red']),
)
layout = go.Layout(
    title='Distribuition of States',
    legend=dict(orientation="h"),
)

fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

Dari pie plot diatas dapat dilihat lebih banyak yang gagal melakukan pinjaman dibandingkan yang berhasil.

In [None]:
# Draw 10,000 projects (with replacement) from each outcome so both groups have
# the same size; the fixed seed keeps the figure reproducible across re-runs.
df_failed = df[df["state"] == "failed"].sample(10000, replace=True, random_state=42)
df_sucess = df[df["state"] == "successful"].sample(10000, replace=True, random_state=42)

# log(goal + 1): the +1 keeps a goal of 0 finite under the logarithm
log_goal = np.log(df.goal + 1)
failed = np.log(df_failed['goal'] + 1)
success = np.log(df_sucess["goal"] + 1)

# Bottom-left panel: all projects, unit-width bins over [-5, 19]
trace0 = go.Histogram(
    x=log_goal,
    histnorm='probability', showlegend=False,
    xbins=dict(
        start=-5.0,
        end=19.0,
        size=1),
    autobiny=True)

# Bottom-right panel: the same log-goal data, unit-width bins over [-1, 17]
# NOTE(review): this panel is titled 'Pledged' below but plots the goal again
# (the pledged column was dropped during feature selection) — confirm intent.
trace1 = go.Histogram(
    x=log_goal,
    histnorm='probability', showlegend=False,
    xbins=dict(
        start=-1.0,
        end=17.0,
        size=1))

# Top panel: overlaid distributions of the failed and successful samples
trace3 = go.Histogram(
    x=failed,
    opacity=0.60, nbinsx=30, name='Goals Failed', histnorm='probability'
)
trace4 = go.Histogram(
    x=success,
    opacity=0.60, nbinsx=30, name='Goals Sucessful', histnorm='probability'
)

# Grid: one full-width panel on top, two panels underneath
fig = plotly.tools.make_subplots(rows=2, cols=2, specs=[ [{'colspan': 2}, None], [{}, {}]],
                          subplot_titles=('Failed and Sucessful Projects',
                                          'Goal','Pledged'))

# Place the traces: (row, col)
fig.append_trace(trace0, 2, 1)
fig.append_trace(trace1, 2, 2)
fig.append_trace(trace3, 1, 1)
fig.append_trace(trace4, 1, 1)

# barmode='overlay' lets the two top-panel histograms be drawn on top of each other
fig['layout'].update(title="Goals Distribuition",
                     height=500, width=900, barmode='overlay')
iplot(fig)

- Meskipun data kami tampak terdistribusi normal, ada baiknya melakukan tes untuk memastikan.
- Jadi, sebelum kita melangkah lebih jauh, mari kita uji apakah data terdistribusi normal?

# Hypothesis Testing

- Normality Test
- T-test (one sample test & Two sample test)
- Permutation Test
- A/B Test
- Chi Squared Test

In [None]:
# Normality test (Shapiro-Wilk) on a reproducible sample of 500 log-transformed goals
log_goal_sample = np.log(df['goal'] + 1).sample(500, random_state=42)
stat, p = stats.shapiro(log_goal_sample)

print("Shapiro stat:", stat)
print("P-value: ", p)
# A p-value of at least 0.05 means normality is not rejected
print('Normal Distribution' if p >= 0.05 else "Non-Normal Distribution")

Berdasarkan normality test di atas, p-value < alpha (p-value sangat kecil, sekitar 8 angka di belakang koma), maka H0 ditolak dan data tidak berdistribusi normal.

## T-test (one sample test & Two sample test)

In [None]:
# Simulated normal population of 100,000 draws used as the histogram backdrop in the t-test plot
# NOTE(review): loc/scale look like the mean and standard deviation of goal from an earlier run — confirm.
pop = np.random.normal(loc=40613.495, scale=1058281.871, size=100000)

In [None]:
# One-sample t-test: is the mean goal different from the hypothesized value 5000?
alpha = 0.05
t_result = stats.ttest_1samp(df.goal, 5000)
print(t_result)

# Compare the two-sided p-value against alpha
is_significant = t_result.pvalue < alpha
print("goal are significantly biased" if is_significant else "No significant bias found")

In [None]:
# One-sample t-test of the mean goal against the hypothesized value 5000
t,p = stats.ttest_1samp(df.goal, 5000)
print ("t-statistic:" + str(t))
# ttest_1samp is 2-tailed
print("p-value:" + '%f' % p)

# 95% interval of the simulated population: 5% of the probability lies outside it,
# 2.5% in each tail. It is built from the mean and standard deviation of `pop`
# (defined in the previous cell) so the thresholds are on the same scale as the
# histogram drawn below.
pop_mean, pop_std = pop.mean(), pop.std()
ci = stats.norm.interval(0.95, pop_mean, pop_std)

plt.hist(pop, bins=100)
# show the mean of the simulated population
plt.axvline(pop_mean, color='yellow', linestyle='dashed', linewidth=2)
# show the interval thresholds - 5% of the probability is under the curve outside these
plt.axvline(ci[0], color='red', linestyle='dashed', linewidth=2)
plt.axvline(ci[1], color='red', linestyle='dashed', linewidth=2)
# show the t-statistic thresholds, placed t standard deviations either side of the mean
plt.axvline(pop_mean - t*pop_std, color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(pop_mean + t*pop_std, color='magenta', linestyle='dashed', linewidth=2)
plt.show()

In [None]:
# Goal amounts split by project outcome
is_successful = df['state'] == "successful"
is_failed = df['state'] == "failed"
successful_goal_ratio = df.loc[is_successful, 'goal']
failed_goal_ratio = df.loc[is_failed, 'goal']

# Two-sample (independent) t-test on the mean goal of the two groups
t_result = stats.ttest_ind(successful_goal_ratio, failed_goal_ratio)
print(t_result)

# Compare the two-sided p-value against alpha
alpha = 0.05
if t_result.pvalue < alpha:
    print("successful and failed have different mean goal")
else:
    print("No significant difference found")

In [None]:
# Seed the legacy global RNG so the simulated population below is reproducible
np.random.seed(123)
successful_goal_ratio = df[df.state == "successful"].goal
failed_goal_ratio = df[df.state == "failed"].goal

# Labels name the groups actually being compared (failed vs successful projects)
print("failed sample mean:" + str(failed_goal_ratio.mean()))
print("successful sample mean:" + str(successful_goal_ratio.mean()))

# Independent T-Test
t,p = stats.ttest_ind(successful_goal_ratio, failed_goal_ratio)
# ttest_ind is 2-tailed, so half the resulting p-value to get a 1-tailed p-value
p1 = '%f' % (p/2)
print("t-statistic:" + str(t))
print("p-value:" + str(p1))

# Simulated normal population with the mean and standard deviation of the failed group
pop = np.random.normal(failed_goal_ratio.mean(), failed_goal_ratio.std(), 100000)
# calculate a 90% confidence interval. 10% of the probability is outside this, 5% in each tail
ci = stats.norm.interval(0.90, failed_goal_ratio.mean(), failed_goal_ratio.std())
plt.hist(pop, bins=100)
# show the mean of the simulated population
plt.axvline(pop.mean(), color='yellow', linestyle='dashed', linewidth=2)
# show the right-tail confidence interval threshold - 5% of probability is under the curve to the right of this.
plt.axvline(ci[1], color='red', linestyle='dashed', linewidth=2)
# show the t-statistic, placed t standard deviations from the mean
plt.axvline(pop.mean() + t*pop.std(), color='magenta', linestyle='dashed', linewidth=2)
plt.show()

## Permutation Test

In [None]:
# Recode the outcome column: 'failed' becomes 0 and 'successful' becomes 1.
# The nested dict restricts the replacement to the 'state' column only.
# NOTE(review): 'state' was cast to category earlier; how replace() treats an explicit
# value=None on a categorical column may differ between pandas versions — confirm on the version in use.
df = df.replace(to_replace = {'state' : {'failed' : 0, 'successful' : 1}}, value = None)

In [None]:
# Store the recoded outcome as plain 64-bit integers, then preview the frame
df['state'] = df['state'].astype('int64')
df.head()

In [None]:
# Technology projects funded in USD
is_technology = df['main_category'].eq('Technology')
is_usd = df['currency'].eq('USD')
df_perm = df.loc[is_technology & is_usd]
df_perm.head()

In [None]:
# Technology projects funded in EUR
is_technology = df['main_category'].eq('Technology')
is_eur = df['currency'].eq('EUR')
df_perm1 = df.loc[is_technology & is_eur]
df_perm1.head()

In [None]:
# Outcome (state) of each subset as a plain NumPy array: USD first, then EUR
usd_outcomes = df_perm['state']
eur_outcomes = df_perm1['state']
df_permutation = usd_outcomes.to_numpy()
df_permutation2 = eur_outcomes.to_numpy()
# Permutation Test
def permutation_sample(dataA, dataB) :
    """Pool two samples, shuffle the pool and split it back into two groups.

    Parameters
    ----------
    dataA, dataB : array-like (1-D)
        The two observed samples.

    Returns
    -------
    (perm_sample_1, perm_sample_2) : tuple of np.ndarray
        A random re-labelling of the pooled data; the first array has
        len(dataA) elements and the second holds the remaining len(dataB).
    """
    # Pool the arguments themselves (not module-level arrays) so the function
    # works for any pair of samples passed by the caller.
    data = np.concatenate([dataA, dataB])

    permuted_data = np.random.permutation(data)

    perm_sample_1 = permuted_data[:len(dataA)]
    perm_sample_2 = permuted_data[len(dataA):]

    return perm_sample_1, perm_sample_2

def draw_perm_reps(data_A,data_B,func,size=1) :
    """Return `size` permutation replicates of the statistic `func`.

    Each replicate applies `func` to one fresh pair of samples produced by
    permutation_sample(data_A, data_B). The result is a float array.
    """
    replicate_values = (
        func(*permutation_sample(data_A, data_B)) for _ in range(size)
    )
    return np.fromiter(replicate_values, dtype=float)

def diff_of_means(data_A, data_B): 
    """Return mean(data_A) minus mean(data_B)."""
    mean_a = np.mean(data_A)
    mean_b = np.mean(data_B)
    return mean_a - mean_b

# Observed difference in mean outcome between the USD and EUR subsets
empirical_diff_means = diff_of_means(df_permutation, df_permutation2)

# 10,000 permutation replicates of the same statistic
perm_replicates = draw_perm_reps(
    df_permutation, df_permutation2, diff_of_means, size = 10000
)

# One-sided p-value: share of replicates at least as large as the observed difference
at_least_as_extreme = perm_replicates >= empirical_diff_means
p = np.sum(at_least_as_extreme) / perm_replicates.size
print(f"P-Value is: {p}")

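p-value lebih kecil dari alpha maka tolak H0 dan terima H1, jadi terdapat perbedaan yang signifikan pada rata-rata `state` (tingkat keberhasilan) antara proyek Technology yang menggunakan USD dan yang menggunakan EUR.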

## A/B Test

In [None]:
# Count of each state value among the rows selected into df_perm
df_perm['state'].value_counts()

In [None]:
# Count of each state value among the rows selected into df_perm1
df_perm1['state'].value_counts()

In [None]:
def diff_frac(data1, data2):
    """Return the difference in success fraction between two groups (data2 - data1).

    Parameters
    ----------
    data1, data2 : array-like of 0/1 outcomes
        1 marks a successful project, 0 a failed one.

    Returns
    -------
    float
        fraction of 1s in data2 minus fraction of 1s in data1.

    The fractions are computed from the arguments, so every permutation
    replicate gets its own value. With hard-coded counts the statistic is a
    constant and the permutation p-value is always 1.
    NOTE(review): assumes the previously hard-coded 4724 and 492 were the
    counts of successful (state == 1) rows — confirm against value_counts().
    """
    frac1 = np.mean(np.asarray(data1) == 1)
    frac2 = np.mean(np.asarray(data2) == 1)
    return frac2 - frac1

def draw_perm_reps(data_1, data_2, func, size=1) :
    """Return `size` permutation replicates of the statistic `func`.

    Note: this re-defines the function of the same name from the permutation
    test cell; the logic is the same.
    """
    replicates = np.empty(size)
    for idx in range(size):
        # One shuffled re-split of the pooled data per replicate
        replicates[idx] = func(*permutation_sample(data_1, data_2))
    return replicates

# Observed statistic on the original (un-shuffled) groups
diff_frac_obs = diff_frac(df_permutation, df_permutation2)

# 10000 permutation replicates of the same statistic
perm_replicates = draw_perm_reps(
    df_permutation, df_permutation2, diff_frac, size=10000
)

# Share of replicates at least as large as the observed value
n_at_least_observed = np.sum(perm_replicates >= diff_frac_obs)
p_value = n_at_least_observed / len(perm_replicates)
print('The p-value is %f' % p_value)

Hasil dari A/B testing menunjukkan p-value lebih besar daripada alpha (di sini p-value = 1), sehingga kita gagal menolak H0: tidak ditemukan perbedaan proporsi yang signifikan antara proyek teknologi yang menggunakan USD dan yang menggunakan EUR. Dengan kata lain, kebanyakan orang gagal dalam melakukan pinjaman pada segmen teknologi, baik yang pembayarannya menggunakan USD maupun EUR.

## Chi Square Test

In [None]:
# Chi-squared test, step 1: number of projects per (currency, main_category) pair.
# The count of non-null goal values is used as the project count.
group_keys = ['currency', 'main_category']
df_chi = df.groupby(group_keys)[['goal']].count().reset_index()
df_chi

In [None]:
# Step 2: contingency table with one row per main category and one column per currency
pivot_spec = dict(index='main_category', columns='currency', values='goal')
df_pivot = df_chi.pivot(**pivot_spec)
df_pivot.head()

In [None]:
# Step 3: chi-squared test of independence on the contingency table.
# The degrees of freedom go into `dof`, not `df`, so the dataset DataFrame
# bound to `df` is not overwritten and earlier cells can still be re-run.
chisq, pvalue, dof, expected = stats.chi2_contingency(df_pivot)
print(f'Observerd chi2: {chisq:.4f}')
print(f'p-value: {pvalue:.4f}')

p-value < alpha maka tolak H0: ditemukan hubungan (asosiasi) yang signifikan antara kategori utama dengan mata uang.