# Starting Kit

In [1]:
from jyquickhelper import add_notebook_menu
add_notebook_menu(first_level=1)

## Modules

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

from gensim.models import Doc2Vec
from scipy.spatial import distance
from nltk.corpus import stopwords
from gensim.models.doc2vec import LabeledSentence

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial import distance
from sklearn.base import BaseEstimator

from problem import get_train_data, get_test_data
from problem import metric_report

##  Load and clean data

In [3]:
data = pd.read_csv('../kickstarter-bis/data/test.csv')
data = data.dropna(subset=['name'])

data.index = np.arange(0, len(data))

In [4]:
## To do before in the "problem.py"

labels = data['pledged']
data.drop(['pledged', 'state', 'usd_pledged_real', 'pledged', 'usd pledged', 'backers'], 
          axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

## Feature extractor

In [5]:
# remove dashes and apostrophes from punctuation marks 
punct = string.punctuation.replace('-', '').replace("'",'')
# regex to match intra-word dashes and intra-word apostrophes
my_regex = re.compile(r"(\b[-']\b)|[\W_]")

def clean_string(string, punct=punct, my_regex=my_regex, to_lower=False):
    if to_lower:
        string = string.lower()
    # remove formatting
    str = re.sub('\s+', ' ', string)
     # remove punctuation
    str = ''.join(l for l in str if l not in punct)
    # remove dashes that are not intra-word
    str = my_regex.sub(lambda x: (x.group(1) if x.group(1) else ' '), str)
    # strip extra white space
    str = re.sub(' +',' ',str)
    # strip leading and trailing white space
    str = str.strip()
    return str

In [6]:
class FeatureExtractor(object):
    def __init__(self):
        pass

    def fit(self, X, y):
        
        #### NLP BASICS ####
        names = X['name'].tolist()
        cleaned_project_names = []
        
        for idx, doc in enumerate(names):
            # clean
            doc = clean_string(doc, punct, my_regex, to_lower=True)
            # tokenize (split based on whitespace)
            tokens = doc.split(' ')
            # remove digits
            tokens = [''.join([elt for elt in token if not elt.isdigit()]) for token in tokens]
            # remove tokens shorter than 3 characters in size
            tokens = [token for token in tokens if len(token) > 1]
            # remove tokens exceeding 25 characters in size
            tokens = [token for token in tokens if len(token) <= 25]
            cleaned_project_names.append(tokens)
            
        ##### Word2vect embedding
        self.model = Word2Vec(cleaned_project_names, min_count=1, size=100, workers=8)
        
        return self

    def transform(self, X):
        data = X.copy()
        
        #### SIMPLE TRANSFORMATION #### 
        
        data['launched_date'] = pd.to_datetime(data['launched'], format='%Y-%m-%d %H:%M:%S')
        data['deadline_date'] = pd.to_datetime(data['deadline'], format='%Y-%m-%d %H:%M:%S')
        
        # Length of project
        data['length'] = data['deadline_date'] - data['launched_date']
        data['length'] = [d.days for d in data['length']]
        
        # Features with month and year of launch
        data['year'] = [d.year for d in data['launched_date']]
        data['month'] = [d.month for d in data['launched_date']]
        data['day'] = [d.day for d in data['launched_date']]
        
        # Length of name
        data['name_length'] = [len(name) for name in data['name']]

        # Number of words
        data['word_number'] = [len(name.split(' ')) for name in data['name']]

        # Ponctuation
        data['question'] = (data.name.str[-1] == '?').astype(int)
        data['exclamation'] = (data.name.str[-1] == '!').astype(int)

        # Upper
        data['uppercase'] = data.name.str.isupper().astype(float)
        
        # Create dummies for categorical features
        main_category = pd.get_dummies(data['main_category'],prefix='mc')
        category = pd.get_dummies(data['category'], prefix = 'cat')
        country = pd.get_dummies(data['country'], prefix = 'country')
        currency = pd.get_dummies(data['currency'], prefix = 'currency')

        data = pd.concat([data, main_category, category, country, currency], axis=1)
        
        # Drop several features
        names = data['name'].tolist()

        features_to_drop =['main_category', 'category', 'country', 'currency', 'name',
                           'deadline', 'deadline_date', 'launched_date', 'launched',
                           'usd_goal_real', 'ID']
        data.drop(features_to_drop, axis=1, inplace=True)

        #### NLP BASICS ####
        cleaned_project_names = []
        
        for idx, doc in enumerate(names):
            # clean
            doc = clean_string(doc, punct, my_regex, to_lower=True)
            # tokenize (split based on whitespace)
            tokens = doc.split(' ')
            # remove digits
            tokens = [''.join([elt for elt in token if not elt.isdigit()]) for token in tokens]
            # remove tokens shorter than 3 characters in size
            tokens = [token for token in tokens if len(token) > 1]
            # remove tokens exceeding 25 characters in size
            tokens = [token for token in tokens if len(token) <= 25]
            cleaned_project_names.append(tokens)
            
        name_matrix = np.zeros((len(cleaned_project_names), 100), dtype="float32")

        for i in range(len(cleaned_project_names)):
            try:
                name_matrix[i,]= self.model.wv[cleaned_project_names[i]].sum(0) / len(cleaned_project_names[i]) 
            except:
                pass
                
        name_embeddings = pd.DataFrame(name_matrix)
        
        data = pd.concat([data, name_embeddings], axis=1)
        
        return data

## Regressor

In [7]:
class Regressor(BaseEstimator):
    
    def __init__(self):
        self.model = LGBMRegressor()

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        prediction = self.model.predict(X)
        return np.maximum(prediction, np.zeros(prediction.shape[0]))

## Model

In [8]:
feature_extractor = FeatureExtractor()

feature_extractor.fit(X_train, y_train)

X_train = feature_extractor.transform(X_train)
X_test = feature_extractor.transform(X_test)

In [9]:
lgb_regressor = Regressor()

lgb_regressor.fit(X_train, y_train)

y_pred = lgb_regressor.predict(X_test)

## Metrics

In [10]:
metric_report(X_test, y_true=y_test, y_pred=y_pred)

-------- REGRESSION METRICS --------

RMSE: 78447.00
MAE: 13146.39

-------- CLASSIFICATION METRICS --------

Accuracy: 0.65
Precision: 0.49
Recall: 0.39
