In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import dirname
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict


# abspath resolves the notebook name against the current working directory; all data
# files are then looked up in the "data" folder of that directory's parent.
notebook_path = os.path.abspath("DataAnalyticsKickstarterNotebook_Cedrik.ipynb")
project_root = os.path.dirname(dirname(notebook_path))

csv_path = os.path.join(project_root, "data/ks-project-edited-merged.csv")
csv_path_oneProject = os.path.join(project_root, "data/ks-project-edited-oneProject.csv")
csv_path_multipleProjects = os.path.join(project_root, "data/ks-project-edited-multipleProjects.csv")

In [2]:
# Read the merged project table and the two creator subsets with identical reader options.
df, df_oneProject, df_multipleProjects = (
    pd.read_csv(source_path, low_memory=False)
    for source_path in (csv_path, csv_path_oneProject, csv_path_multipleProjects)
)

# Number of characters in each project title.
title_lengths = df["name"].str.len()
df["name_length"] = title_lengths

df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,duration,creator_id,name_length
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,59,753774991,31
1,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,35,362504450,20
2,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0,45,1295394884,49
3,100005484,Lisa Lim New CD!,Indie Rock,Music,USD,2013-04-08,12500.0,2013-03-09 06:42:58,12700.0,successful,100,US,12700.0,12700.0,12500.0,30,1116977628,16
4,1000081649,MikeyJ clothing brand fundraiser,Childrenswear,Fashion,AUD,2017-09-07,2500.0,2017-08-08 01:20:20,1.0,failed,1,AU,0.0,0.81,2026.1,30,1942626789,32


In [3]:
# Start every project with the placeholder value 3; the labelling cells that follow
# overwrite it with 0, 1 or 2.
PLACEHOLDER_CREATOR_TYPE = 3
df["creator_type"] = PLACEHOLDER_CREATOR_TYPE
df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,duration,creator_id,name_length,creator_type
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,59,753774991,31,3
1,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,35,362504450,20,3
2,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0,45,1295394884,49,3
3,100005484,Lisa Lim New CD!,Indie Rock,Music,USD,2013-04-08,12500.0,2013-03-09 06:42:58,12700.0,successful,100,US,12700.0,12700.0,12500.0,30,1116977628,16,3
4,1000081649,MikeyJ clothing brand fundraiser,Childrenswear,Fashion,AUD,2017-09-07,2500.0,2017-08-08 01:20:20,1.0,failed,1,AU,0.0,0.81,2026.1,30,1942626789,32,3


In [4]:
# Label projects whose creator appears in the single-project table with creator_type 0.
# A single vectorised membership test is used instead of scanning the whole frame once
# per creator. Every row of a matching creator is labelled, and creator ids that do not
# occur in df are simply ignored.
is_single_project_creator = df["creator_id"].isin(df_oneProject["creator_id"])
df.loc[is_single_project_creator, "creator_type"] = 0

In [5]:
# Group each creator's rows together and order them by launch timestamp, so the first
# row per creator_id is that creator's earliest project.
# NOTE(review): "launched" is read without date parsing, so this is a text sort; it is
# chronological as long as every value uses the YYYY-MM-DD HH:MM:SS layout — confirm.
sort_keys = ["creator_id", "launched"]
df = df.sort_values(by=sort_keys)

In [6]:
# Label every project of a creator listed in df_multipleProjects by the outcome of that
# creator's first project: 1 if it was "successful", 2 otherwise. "First" is the first
# row per creator_id in df's current row order, so df must already be sorted by
# creator_id and launch time when this cell runs.
first_state_by_creator = (
    df.drop_duplicates(subset="creator_id")  # keeps the first row of each creator
    .set_index("creator_id")["state"]
)
is_multi_project_creator = df["creator_id"].isin(df_multipleProjects["creator_id"])
first_project_successful = (
    df.loc[is_multi_project_creator, "creator_id"]
    .map(first_state_by_creator)
    .eq("successful")
)

# One vectorised assignment instead of scanning the whole frame for every row of
# df_multipleProjects; creator ids that do not occur in df are simply ignored.
df.loc[is_multi_project_creator, "creator_type"] = np.where(first_project_successful, 1, 2)

In [7]:
# Sanity check of the labelling: number of projects per creator_type value.
creator_type_column = df["creator_type"]
creator_type_column.value_counts()

0    100219
1     20242
2      9119
Name: creator_type, dtype: int64

In [8]:
# Preview the first five rows of the sorted, labelled table.
df.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,duration,creator_id,name_length,creator_type
100166,558744194,Feltron vs Kickstarter,Graphic Design,Design,USD,2009-10-14,3000.0,2009-09-24 07:33:14,3222.0,successful,132,US,3222.0,3222.0,3000.0,20,3,22,1
23118,134175247,Center for Lost Arts – a document,Public Art,Art,USD,2015-07-01,5000.0,2015-06-16 22:02:59,11486.0,successful,190,US,11486.0,11486.0,5000.0,15,3,33,1
39115,1577455391,This is also not a Kickstarter shirt,Apparel,Fashion,USD,2014-05-29,500.0,2014-05-15 00:17:37,16167.71,successful,604,US,16167.71,16167.71,500.0,14,8,36,1
48686,171893227,Invisible Courts,Zines,Publishing,USD,2017-01-12,50.0,2017-01-04 19:28:57,207.66,successful,104,US,113.0,207.66,50.0,8,8,16,1
96300,499552311,Kind of Bloop: An 8-Bit Tribute to Miles Davis,Jazz,Music,USD,2009-08-01,2000.0,2009-05-12 20:50:44,8647.79,successful,419,US,8647.79,8647.79,2000.0,81,9,46,1


In [9]:
# Keep the pledged amount as an independent copy; it is exported later as the
# regression target.
df_usd_pledged_real = df.loc[:, "usd_pledged_real"].copy()
df_usd_pledged_real.head(n=5)

100166     3222.00
23118     11486.00
39115     16167.71
48686       207.66
96300      8647.79
Name: usd_pledged_real, dtype: float64

In [10]:
# Columns used as model inputs; main_category is still categorical at this point.
list_features = [
    "usd_goal_real",
    "duration",
    "name_length",
    "creator_type",
    "main_category",
]
# Independent copies, so later edits to the feature/label frames do not touch df.
df_features = df.loc[:, list_features].copy()
df_state = df.loc[:, ["ID", "state"]].copy()

In [11]:
# One-hot encode main_category and append the indicator columns after the numeric
# features. The encoding starts from df[list_features] rather than from df_features
# itself, so the cell can be re-run without failing on an already-removed column.
raw_features = df[list_features]
df_features = pd.concat(
    [raw_features.drop(columns="main_category"), pd.get_dummies(raw_features["main_category"])],
    axis=1,
)

# Binary target: 1 for "successful", 0 for every other state. Building the column
# directly avoids depending on the exact set of other state names and yields integer
# labels regardless of the dtype get_dummies would produce.
true_labels = df_state[["ID"]].assign(
    successful=df_state["state"].eq("successful").astype(int)
)

In [13]:
# Write the encoded feature table (without the index) to the data folder.
# NOTE(review): the file name says "mitPledged" (with pledged), but list_features does
# not contain usd_pledged_real, so a fresh run writes a file without that column —
# confirm which feature set this export is meant to hold.
features_with_pledged_path = os.path.join(
    os.path.dirname(dirname(notebook_path)),
    "data/ks-project-edited-klassifikation-features-mitPledged.csv",
)
df_features.to_csv(features_with_pledged_path, index=False)

In [12]:
# Preview the first five rows of the encoded feature table.
df_features.head(n=5)

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,usd_pledged_real,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
100166,3000.0,20,22,1,3222.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
23118,5000.0,15,33,1,11486.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39115,500.0,14,36,1,16167.71,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
48686,50.0,8,16,1,207.66,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
96300,2000.0,81,46,1,8647.79,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [48]:
# Preview the first five rows of the label table (project ID and success flag).
true_labels.head(n=5)

Unnamed: 0,ID,successful
100166,558744194,1
23118,134175247,1
39115,1577455391,1
48686,171893227,1
96300,499552311,1


In [52]:
# Rescale the continuous features to the range [0, 1]. MinMaxScaler works column by
# column, so a single fit_transform over all three columns yields the same values as
# scaling each column separately, without fitting twice, and leaves `scaler` fitted on
# every scaled column.
# NOTE(review): the scaler is fitted on the complete feature table, i.e. before the
# train/dev split, so dev-set minima/maxima influence the scaling — confirm this is intended.
scaler = MinMaxScaler()
list_scaler = ["usd_goal_real", "duration", "name_length"]
df_features[list_scaler] = scaler.fit_transform(df_features[list_scaler])
        

In [53]:
# Preview the first five rows of the feature table after scaling.
df_features.head(n=5)

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
100166,1.981553e-05,0.208791,0.221053,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
23118,3.302593e-05,0.153846,0.336842,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39115,3.302534e-06,0.142857,0.368421,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
48686,3.301939e-07,0.076923,0.157895,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
96300,1.321033e-05,0.879121,0.473684,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# Klassifikation mit SVM

In [15]:
# Hold out a dev split (train_test_split's default size) with a fixed seed so the
# partition is repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    df_features,
    true_labels["successful"],
    random_state=42,
)

In [16]:
# Define the pipeline without fixing the tunable parameters; the grid search sets them.

feature_selection = SelectFromModel(LinearSVC(penalty="l1", dual=False))
classifier = SVC()

pipeline = Pipeline([('feature_selection', feature_selection),
                     ('classifier', classifier)])

# Parameter space: each key is <step name>__<parameter name>, each value lists the
# candidate settings to evaluate.

parameters = {
    'feature_selection__threshold': (None, 'mean'),
    'classifier__kernel': ('linear', 'rbf')
}

# Exhaustive search over the whole parameter space (cross-validation on the training data).
# n_jobs=-1 spreads the independent fits over all CPU cores; run serially, the individual
# SVC fits took between half an hour and many hours each. The selected parameters are
# unaffected, but per-fit progress lines may be printed by worker processes instead of
# appearing in the cell output.
grid_search = GridSearchCV(pipeline, param_grid=parameters, verbose=10, n_jobs=-1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START classifier__kernel=linear, feature_selection__threshold=None
[CV 1/5; 1/4] END classifier__kernel=linear, feature_selection__threshold=None;, score=0.616 total time=141.7min
[CV 2/5; 1/4] START classifier__kernel=linear, feature_selection__threshold=None
[CV 2/5; 1/4] END classifier__kernel=linear, feature_selection__threshold=None;, score=0.610 total time=30.6min
[CV 3/5; 1/4] START classifier__kernel=linear, feature_selection__threshold=None
[CV 3/5; 1/4] END classifier__kernel=linear, feature_selection__threshold=None;, score=0.619 total time=977.3min
[CV 4/5; 1/4] START classifier__kernel=linear, feature_selection__threshold=None
[CV 4/5; 1/4] END classifier__kernel=linear, feature_selection__threshold=None;, score=0.613 total time=30.2min
[CV 5/5; 1/4] START classifier__kernel=linear, feature_selection__threshold=None
[CV 5/5; 1/4] END classifier__kernel=linear, feature_selection__threshold=None;, scor



[CV 1/5; 4/4] END classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.575 total time=11.5min
[CV 2/5; 4/4] START classifier__kernel=rbf, feature_selection__threshold=mean...
[CV 2/5; 4/4] END classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.575 total time=12.0min
[CV 3/5; 4/4] START classifier__kernel=rbf, feature_selection__threshold=mean...
[CV 3/5; 4/4] END classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.576 total time=15.3min
[CV 4/5; 4/4] START classifier__kernel=rbf, feature_selection__threshold=mean...
[CV 4/5; 4/4] END classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.574 total time=16.8min
[CV 5/5; 4/4] START classifier__kernel=rbf, feature_selection__threshold=mean...
[CV 5/5; 4/4] END classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.575 total time=15.2min


GridSearchCV(estimator=Pipeline(steps=[('feature_selection',
                                        SelectFromModel(estimator=LinearSVC(dual=False,
                                                                            penalty='l1'))),
                                       ('classifier', SVC())]),
             param_grid={'classifier__kernel': ('linear', 'rbf'),
                         'feature_selection__threshold': (None, 'mean')},
             verbose=10)

In [17]:
# Show the pipeline configuration that achieved the best cross-validation score.
best_pipeline = grid_search.best_estimator_
print(best_pipeline)

Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classifier', SVC())])


In [18]:
# Re-create the pipeline with the configuration the grid search reported as best.
final_feature_selection = SelectFromModel(estimator=LinearSVC(penalty='l1', dual=False))
final_classifier = SVC()
final_pipeline = Pipeline(
    steps=[
        ('feature_selection', final_feature_selection),
        ('classifier', final_classifier),
    ]
)

# How well does the finished learner do on the training data?
# It is evaluated by cross-validation, analogous to the parameter search.
# cross_val_predict keeps, for every data point, the prediction made while that point
# was in the held-out fold, so every prediction is for unseen data and the predictions
# cover the whole training set.
train_labels = cross_val_predict(final_pipeline, X_train, y_train, cv=10)

# Compute precision / recall / F-score.
training_report = classification_report(y_train, train_labels)
print(training_report)

              precision    recall  f1-score   support

           0       0.69      0.69      0.69     46451
           1       0.72      0.71      0.71     50734

    accuracy                           0.70     97185
   macro avg       0.70      0.70      0.70     97185
weighted avg       0.70      0.70      0.70     97185



In [20]:
# Train the learner one last time on the complete training split, then evaluate it on
# the held-out dev split. classification_report is already imported in the notebook's
# import cell, so it is not imported again here.

# Fit on all training data.
final_pipeline.fit(X_train, y_train)

# Predict the held-out labels once and reuse them for every metric; calling score() and
# predict() separately would run the expensive SVC prediction twice.
test_labels = final_pipeline.predict(X_dev)

# The classifier's default score is mean accuracy, i.e. the share of correct predictions.
dev_accuracy = float((test_labels == y_dev).mean())
print("Default-Score des Klassifizierers: Accuracy=", dev_accuracy, "\n")

# Precision / recall / F-score on the dev split.
print(classification_report(y_dev, test_labels))

Default-Score des Klassifizierers: Accuracy= 0.7031023306065751 

              precision    recall  f1-score   support

           0       0.68      0.69      0.69     15302
           1       0.72      0.71      0.72     17093

    accuracy                           0.70     32395
   macro avg       0.70      0.70      0.70     32395
weighted avg       0.70      0.70      0.70     32395



In [21]:
# Confusion table of actual vs. predicted labels on the dev split, with row and column totals.
pd.crosstab(
    y_dev,
    test_labels,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10569,4733,15302
1,4885,12208,17093
All,15454,16941,32395


Auf den Dev-Daten wurden die nicht erfolgreichen Projekte (0: fehlgeschlagen, abgebrochen oder noch laufend) von unserem Klassifizierer zu 69% richtig erkannt (Recall) und die erfolgreichen Projekte (1) zu 71%.

In [54]:
# Export the feature table, the label table and the regression target (all without
# their index) to the data folder next to the notebook's parent directory.
export_root = os.path.dirname(dirname(notebook_path))

features_path = os.path.join(export_root, "data/ks-project-edited-klassifikation-features.csv")
labels_path = os.path.join(export_root, "data/ks-project-edited-klassifikation-target.csv")
regression_target_path = os.path.join(export_root, "data/ks-project-edited-regression-target.csv")

df_features.to_csv(features_path, index=False)
true_labels.to_csv(labels_path, index=False)
df_usd_pledged_real.to_csv(regression_target_path, index=False)