In [2]:
# Cell 0
# All the needed imports

import numpy as np
import pandas as pd
#%pip install nltk
import nltk
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
from IPython.display import display



In [3]:
#nltk.download('all')

In [4]:
# Cell 1
# Read the Kickstarter projects CSV and keep only complete rows

# Load the file and, in the same chain, discard every row that contains a NaN
original_df = pd.read_csv('ks-projects-201801.csv').dropna()

# Preview the first rows of the cleaned frame
display(original_df.head())

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [5]:
# Cell 2
# Reorganize the data

# Drop the pledged-amount columns, the backer count and the raw "goal" column.
# "ID" and "usd_goal_real" are kept (and renamed below); the sample rows suggest
# the raw "goal" is in the project's own currency while "usd_goal_real" is in USD.
df = original_df.drop(
    ["pledged", "usd pledged", "usd_pledged_real", "goal", "backers"], axis=1
)

# Rename only the columns whose names actually change; every other column
# keeps its original name.
df = df.rename(columns={"ID": "id", "usd_goal_real": "goal"})

# Convert launch and deadline to midnight-aligned timestamps (year-month-day).
# "launched" carries a time of day, so it is truncated with .dt.normalize();
# this stays in datetime64 instead of round-tripping through Python date objects.
# "deadline" is parsed with a date-only format and therefore is already at midnight.
df['launched'] = pd.to_datetime(df['launched'], format="%Y-%m-%d %H:%M:%S").dt.normalize()
df['deadline'] = pd.to_datetime(df['deadline'], format="%Y-%m-%d")

# Calculate the number of days between launch and deadline.
# NOTE: the column name 'durration' is misspelled but is referenced under that
# name by the stat() summary later in the notebook, so it is kept as-is.
df['durration'] = (df['deadline'] - df['launched']).dt.days

# Calendar features for the start and end of each campaign
df['start_month'] = df['launched'].dt.month_name()
df['end_month'] = df['deadline'].dt.month_name()
df['start_day_name'] = df['launched'].dt.day_name()
df['end_day_name'] = df['deadline'].dt.day_name()
display(df.head())

Unnamed: 0,id,name,category,main_category,currency,deadline,launched,state,country,goal,durration,start_month,end_month,start_day_name,end_day_name
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11,failed,GB,1533.95,59,August,October,Tuesday,Friday
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02,failed,US,30000.0,60,September,November,Saturday,Wednesday
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12,failed,US,45000.0,45,January,February,Saturday,Tuesday
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17,failed,US,5000.0,30,March,April,Saturday,Monday
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,2015-07-04,canceled,US,19500.0,56,July,August,Saturday,Saturday


In [6]:
# Cell 3
# Row count and column count before filtering
print(df.shape)

# Keep only campaigns that ended as 'failed' or 'successful', remove the rows
# whose country field holds the malformed code 'N,0"' (the country column itself
# stays), and renumber the index from zero.
df = (
    df[df['state'].isin(['failed', 'successful'])]
    .loc[lambda frame: frame['country'] != 'N,0"']
    .reset_index(drop=True)
)

# df = df.drop(columns=['currency', 'country'], axis=1)

# Row count and column count after filtering
print(df.shape)

# Final data before one hot encoding everything
display(df.head())

(374860, 15)
(331462, 15)


Unnamed: 0,id,name,category,main_category,currency,deadline,launched,state,country,goal,durration,start_month,end_month,start_day_name,end_day_name
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11,failed,GB,1533.95,59,August,October,Tuesday,Friday
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02,failed,US,30000.0,60,September,November,Saturday,Wednesday
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12,failed,US,45000.0,45,January,February,Saturday,Tuesday
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17,failed,US,5000.0,30,March,April,Saturday,Monday
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,2016-02-26,successful,US,50000.0,35,February,April,Friday,Friday


In [8]:
# Show the first rows of the current df; as the cell's last expression it is
# rendered by the notebook without an explicit display() call.
df.head()

Unnamed: 0,id,name,category,main_category,currency,deadline,launched,state,country,goal,durration,start_month,end_month,start_day_name,end_day_name
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11,failed,GB,1533.95,59,August,October,Tuesday,Friday
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02,failed,US,30000.0,60,September,November,Saturday,Wednesday
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12,failed,US,45000.0,45,January,February,Saturday,Tuesday
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17,failed,US,5000.0,30,March,April,Saturday,Monday
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,2016-02-26,successful,US,50000.0,35,February,April,Friday,Friday


In [38]:
# Render floats with two decimals in pandas output (this shapes the describe()
# printout produced by stat below).
pd.set_option('display.float_format', lambda value: '%.2f' % value)


def stat(df):
    """Print a summary for every main category found in ``df``.

    Categories are visited in the order ``df['main_category'].unique()``
    returns them. For each one the function prints:

    * the fraction of all rows of ``df`` that belong to the category,
    * the number of 'successful' and 'failed' rows and the share of
      successes among those two,
    * the mean of the 'durration' column,
    * ``describe()`` of the 'goal' column (count, mean, std, min,
      quartiles and max).

    Nothing is returned; all results go to stdout.
    """
    total_rows = len(df)

    for category in df['main_category'].unique():
        subset = df.loc[df['main_category'] == category]

        print(f"Main Category {category}")
        print(f"How much of the data is this category {len(subset) / total_rows}")

        # Count each outcome once and reuse the counts for the ratio
        n_success = (subset['state'] == 'successful').sum()
        n_failed = (subset['state'] == 'failed').sum()
        print(f"success vs failure: {n_success} vs {n_failed} ratio {n_success / (n_success + n_failed)}")

        print(f"Average Duration: {subset['durration'].mean()}")
        print(f"5 Number Summary of Goal Amount: {subset['goal'].describe()}")
        print()


# Print the per-category summary for the current df
stat(df)



Main Category Publishing
How much of the data is this category 0.10683879298381112
success vs failure: 12300 vs 23113 ratio 0.3473300765255697
Average Duration: 33.727953011605905
5 Number Summary of Goal Amount: count       35413.00
mean        17555.34
std        588173.02
min             0.55
25%          2000.00
50%          4995.00
75%         10000.00
max     100000000.00
Name: goal, dtype: float64

Main Category Film & Video
How much of the data is this category 0.1704659961021173
success vs failure: 23612 vs 32891 ratio 0.41788931561156045
Average Duration: 35.00943312744456
5 Number Summary of Goal Amount: count       56503.00
mean        76122.96
std       1703544.55
min             0.15
25%          2500.00
50%          6400.00
75%         20000.00
max     151395869.92
Name: goal, dtype: float64

Main Category Music
How much of the data is this category 0.13817873542065154
success vs failure: 24105 vs 21696 ratio 0.526298552433353
Average Duration: 35.30933822405624
5 Number