In [27]:
# import main libraries
import pandas as pd
import numpy as np

# make pairplots feature vs state
import seaborn as sns
import matplotlib.pyplot as plt

# to evaluate the model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# load the Kickstarter projects table from the CSV file
csv_path = 'data/kickstarter_projects.csv'
df = pd.read_csv(csv_path)

In [28]:
# Drop columns that are useless (ID) or leak the outcome (Pledged, Backers),
# then remove every project whose State is one of the excluded values below.
excluded_states = ['Live', 'Suspended', 'Canceled', 'Unknown']
df = (
    df.drop(columns=['ID', 'Pledged', 'Backers'])
      .loc[lambda frame: ~frame['State'].isin(excluded_states)]
      .reset_index(drop=True)
)



In [29]:
# Take a reproducible 10% random sample of the data for testing,
# renumbering the rows from 0 so the index is contiguous.
df_sample = (
    df.sample(frac=0.1, random_state=42)
      .reset_index(drop=True)
)

In [30]:
# display the 10% sample; the rendered output elides the middle rows
df_sample

Unnamed: 0,Name,Category,Subcategory,Country,Launched,Deadline,Goal,State
0,"From Hollywood to Hip Hop ""This Is Dante!""",Film & Video,Webseries,United States,2012-10-01 17:29:21,2012-10-31,7000,Failed
1,Sunday: A Portrait of 21st Century England by ...,Photography,Photobooks,United Kingdom,2017-05-11 08:05:44,2017-06-10,7661,Successful
2,The Last Spring: A post apocalyptic novel,Publishing,Fiction,United States,2016-09-17 01:12:30,2016-11-01,5000,Failed
3,Ne' Richa: A NEW ALBUM!!,Music,Music,United States,2012-02-17 17:52:53,2012-04-02,11000,Failed
4,Back-to-Cool US Mall Tour,Music,Music,United States,2013-09-13 06:03:14,2013-09-18,9999,Failed
...,...,...,...,...,...,...,...,...
33141,Ottimo Miami,Food,Vegan,Italy,2015-06-27 08:35:42,2015-07-27,10967,Failed
33142,There's Nothing Wrong With A Song From Yesterday,Music,Country & Folk,United States,2014-04-02 23:19:15,2014-04-28,3500,Successful
33143,Cute As A Button,Art,Installations,United States,2015-03-28 15:18:36,2015-04-22,2000,Successful
33144,Story 7,Publishing,Fiction,United States,2015-10-08 20:15:36,2015-10-10,30,Failed


In [32]:
# Split the column names into the prediction target and the input features.
target = 'State'
features = list(df.columns)
features.remove(target)

# Single-column groups that are routed to their own transformer.
num_features = ['Goal']
name_features = ['Name']

# Every remaining feature is treated as categorical.
# NOTE(review): this leaves the date columns 'Launched' and 'Deadline' in
# cat_features -- confirm that treating them as plain categories is intended.
cat_features = list(features)
for special_column in (num_features[0], name_features[0]):
    cat_features.remove(special_column)

In [35]:
# Show the target and each feature group, one per line.
print(features, target, num_features, name_features, cat_features, sep='\n')

['Name', 'Category', 'Subcategory', 'Country', 'Launched', 'Deadline', 'Goal']
State
['Goal']
['Name']
['Category', 'Subcategory', 'Country', 'Launched', 'Deadline']


In [36]:
#Todo: Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [37]:
# Per-group transformers consumed by the ColumnTransformer.
# TODO: replace the placeholders below with real numerical / name transformers.

# The numeric column is forwarded unchanged for now.
num_pipeline = 'passthrough'

# Project names are raw text. Passing them through unchanged makes the
# ColumnTransformer fail at fit time: next to the sparse one-hot output it
# raises ValueError("For a sparse output, all columns should be a numeric or
# convertible to a numeric."). Until a text transformer is added, the name
# columns are dropped so the preprocessor can actually be fitted.
name_pipeline = 'drop'

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [38]:
# Print each feature group next to its label.
for _label, _cols in [
    ('Num features:', num_features),
    ('Name features:', name_features),
    ('Cat features:', cat_features),
]:
    print(_label, _cols)

Num features: ['Goal']
Name features: ['Name']
Cat features: ['Category', 'Subcategory', 'Country', 'Launched', 'Deadline']


In [None]:
# Assemble the preprocessor: each (name, transformer, columns) triple routes
# one feature group to its transformer; columns not listed are dropped.
column_branches = [
    ('num', num_pipeline, num_features),
    ('name', name_pipeline, name_features),
    ('cat', cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='drop')