In [3]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import re
import string
import random


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score, recall_score, plot_confusion_matrix
#from wordcloud import WordCloud
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from xgboost import XGBClassifier


import warnings

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from pathlib import Path

# import joblib
import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivan4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv('kickstarter_data_with_features.csv',index_col=0)

In [6]:
df = df[['name', 'goal', 'blurb', 'launched_at', 'deadline','category','state', 'country']] 
english_countries = ['US', 'IE', 'GB', 'AU', 'CA', 'NZ', ]

In [7]:
df= df[df['country'].isin(english_countries)]

In [8]:
suc_filt = ['failed', 'successful']
df= df[df['state'].isin(suc_filt)]
df['state'] = df['state'].replace({'failed': 0, 'successful': 1})

In [9]:
columns = ['name','blurb', 'state']
to_df = df.copy()
to_df = to_df[columns]  
to_df.fillna(' ', inplace=True)
to_df['text']=to_df['name']+' '+to_df['blurb']

In [10]:
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

In [11]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return mytokens

In [12]:
class predictors(TransformerMixin):

    def transform(self, X, **transform_params):
            
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [19]:
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [13]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,3))

In [14]:
train, test = train_test_split(to_df, train_size=0.80, test_size=0.20, 
                                stratify= to_df['state'], random_state=3)

train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                                stratify= train['state'], random_state=3)

In [15]:
features = 'text'
target = 'state'
X_train = train[features]
X_val = val[features]
X_test = test[features]
y_train = train[target]
y_val = val[target]
y_test = test[target]

In [16]:
xgm = XGBClassifier(n_jobs=-1, max_depth=200, learning_rate=0.2, min_child_weight=5, )

In [17]:
pipe = Pipeline([("cleaner", predictors()),
                    ('vectorizer', bow_vector),
                    ('classifier', xgm)])

In [20]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x0000014F06B0E430>),
                ('vectorizer',
                 CountVectorizer(ngram_range=(1, 3),
                                 tokenizer=<function spacy_tokenizer at 0x0000014F03B8F4C0>)),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='', learning_rate=0.2,
                               max_delta_step=0, max_depth=200,
                               min_child_weight=5, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=-1, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       

In [21]:
pipe_pred = pipe.predict(X_test)

In [23]:
accuracy_score(pipe_pred,y_test)

0.7027883396704689

In [25]:
pickle.dump(pipe, open('pickle_model.pkl', 'wb'))