In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import dirname
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler


notebook_path = os.path.abspath("DataAnalyticsKickstarterNotebook_Cedrik.ipynb")
csv_path = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-merged.csv")
csv_path_oneProject = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-oneProject.csv")
csv_path_multipleProjects = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-multipleProjects.csv")

In [25]:
df = pd.read_csv (csv_path, low_memory=False)
df_oneProject = pd.read_csv (csv_path_oneProject, low_memory=False)
df_multipleProjects = pd.read_csv (csv_path_multipleProjects, low_memory=False)

df["name_length"] = df["name"].str.len()

df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,duration,creator_id,name_length
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,59,753774991,31
1,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,35,362504450,20
2,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0,45,1295394884,49
3,100005484,Lisa Lim New CD!,Indie Rock,Music,USD,2013-04-08,12500.0,2013-03-09 06:42:58,12700.0,successful,100,US,12700.0,12700.0,12500.0,30,1116977628,16
4,1000081649,MikeyJ clothing brand fundraiser,Childrenswear,Fashion,AUD,2017-09-07,2500.0,2017-08-08 01:20:20,1.0,failed,1,AU,0.0,0.81,2026.1,30,1942626789,32


In [26]:
list_features = ["usd_goal_real","duration","name_length", "creator_id", "main_category"]
df_features = df[list_features].copy()
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_id,main_category
0,1533.95,59,31,753774991,Publishing
1,50000.0,35,20,362504450,Food
2,25000.0,45,49,1295394884,Food
3,12500.0,30,16,1116977628,Music
4,2026.1,30,32,1942626789,Fashion


In [27]:
df_features = pd.concat([df_features.drop('main_category', axis=1), pd.get_dummies(df_features['main_category'])], axis=1)

In [11]:
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_id,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
0,1533.95,59,31,753774991,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,50000.0,35,20,362504450,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,25000.0,45,49,1295394884,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,12500.0,30,16,1116977628,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,2026.1,30,32,1942626789,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [18]:
scaler = MinMaxScaler()
list_scaler = ["usd_goal_real","duration","name_length"]
for feature in list_scaler:
    scaler.fit(df_features[[feature]])
    df_features[feature] = scaler.fit_transform(df_features[[feature]])
        

In [21]:
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_id,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
0,1e-05,0.637363,0.315789,753774991,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0.00033,0.373626,0.2,362504450,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0.000165,0.483516,0.505263,1295394884,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,8.3e-05,0.318681,0.157895,1116977628,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1.3e-05,0.318681,0.326316,1942626789,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [31]:
df_features["creator_type"] = 3
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_id,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater,creator_type
0,1533.95,59,31,753774991,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3
1,50000.0,35,20,362504450,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3
2,25000.0,45,49,1295394884,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3
3,12500.0,30,16,1116977628,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3
4,2026.1,30,32,1942626789,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3


In [34]:
df_test = df_features[df_oneProject["creator_id"] == df_features["creator_id"]]
df_test

ValueError: Can only compare identically-labeled Series objects

In [28]:
df_oneProject

Unnamed: 0,creator_id,numberOfProjects
0,1191647097,1
1,1175171005,1
2,1486542863,1
3,1438384467,1
4,1192181903,1
...,...,...
100214,757083014,1
100215,757079601,1
100216,757061255,1
100217,757059321,1


In [29]:
df_multipleProjects

Unnamed: 0,creator_id,numberOfProjects
0,1655558466,53
1,2104052526,53
2,957058942,46
3,316444840,44
4,2039317553,39
...,...,...
12078,2146568111,2
12079,996412530,2
12080,173971355,2
12081,84025401,2
