In [1]:
import ast
import itertools
from hashlib import md5

import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
from nltk.tokenize.casual import TweetTokenizer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier

%matplotlib notebook

# Wrap each imported metric function in a scorer object so it can be passed
# through the `scoring=` argument of cross_validate.
# NOTE: the names are rebound on purpose -- after this cell `mse`, `mae`,
# `r2_score` and `accuracy_score` refer to scorers, no longer to the plain
# sklearn.metrics functions imported above.
mse, mae = make_scorer(mse), make_scorer(mae)
r2_score, accuracy_score = make_scorer(r2_score), make_scorer(accuracy_score)

In [2]:
from mlxtend.regressor import StackingRegressor, StackingCVRegressor
from mlxtend.classifier import StackingClassifier

In [33]:
# Load the raw training table; the path is relative to the notebook's working directory.
data = pd.read_csv("X_train.csv")

def _property_to_text(raw):
    """Flatten one serialized ``property`` cell into a space-separated token string.

    ``raw`` is the text of a Python literal holding a list of dicts. Only the
    first key/value pair of each dict is used, and a later dict overrides an
    earlier one that has the same key. Each pair becomes a ``"<key>_<value>"``
    token (values must already be strings).
    """
    # literal_eval only accepts Python literals, so a crafted cell in the CSV
    # cannot execute arbitrary code the way the previous eval() call could.
    first_pairs = {}
    for entry in ast.literal_eval(raw):
        key, value = list(entry.items())[0]
        first_pairs[key] = value
    return " ".join("_".join((str(key), value)) for key, value in first_pairs.items())


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's feature names on old and new scikit-learn."""
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when present and fall back for older installations.
    getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    return getter()


def preprocess_data(data):
    """Turn the raw review table into a numeric feature frame and a target series.

    Steps:
      1. Flatten the serialized ``property`` column into ``key_value`` tokens.
      2. One-hot encode ``brandId``, ``categoryLevel1Id`` and ``categoryLevel2Id``.
      3. Replace missing ``commentNegative`` / ``commentPositive`` with ``"EMPTY"``.
      4. Replace ``comment`` with two TF-IDF blocks: word 1-2-grams (tweet
         tokenizer, lower-cased) and character 1-4-grams, both with ``min_df=0.01``.
      5. Convert ``date`` to an int64 timestamp and drop the identifier, free-text
         and target columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. It is not modified; the function works on a copy.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame and the ``reting`` column (spelling as in the CSV).

    Notes
    -----
    * The vectorizers are fitted on whatever frame is passed in, so calling
      this separately on train and test data yields different columns.
    * ``property``, ``commentNegative`` and ``commentPositive`` are prepared
      but currently dropped at the end without being vectorized.
    * The TF-IDF matrices are densified, which is memory hungry on large inputs.
    """
    # Copy first: the original assigned to data["property"] before rebinding,
    # which silently altered the caller's frame and broke a second call.
    data = data.copy()

    data["property"] = data["property"].apply(_property_to_text)
    data = pd.get_dummies(data, columns=['brandId', 'categoryLevel1Id', 'categoryLevel2Id'])

    # Plain assignment instead of chained fillna(inplace=True), which does not
    # reliably write back to the frame under pandas copy-on-write.
    side_comments = ["commentNegative", "commentPositive"]
    data[side_comments] = data[side_comments].fillna("EMPTY")

    text_columns = ["comment"]  # side_comments are intentionally not vectorized yet
    feature_frames = []
    for col in text_columns:
        vectorizers = [
            TfidfVectorizer(tokenizer=TweetTokenizer(False).tokenize, ngram_range=(1, 2), min_df=0.01),
            TfidfVectorizer(analyzer="char", ngram_range=(1, 4), min_df=0.01),
        ]
        for i, vectorizer in enumerate(vectorizers):
            weights = vectorizer.fit_transform(data[col])
            # Strip <, >, [ and ] from the generated names.
            # NOTE(review): presumably because a downstream model rejects these
            # characters in feature names -- confirm. Stripping can in principle
            # make two names collide.
            columns = [
                col + " " + str(i) + " " +
                word.replace("<", "").replace(">", "").replace("[", "").replace("]", "")
                for word in _vocabulary(vectorizer)
            ]
            feature_frames.append(
                # Reuse the input index so concat aligns row-for-row even when
                # the caller's frame does not carry a default RangeIndex.
                pd.DataFrame(weights.toarray(), index=data.index, columns=columns)
            )

    data = pd.concat([data.drop(columns=text_columns)] + feature_frames, axis=1)

    target = data['reting'].copy()
    # Explicit int64: plain `int` is platform dependent and can fail for
    # datetime conversion where it maps to a 32-bit integer.
    data["date"] = pd.to_datetime(data["date"]).astype("int64")
    data = data.drop(columns=['userName', 'reting', 'sku', "commentNegative", "commentPositive", "property"])
    return (data, target)
    
# Split the raw table into the feature frame and the 'reting' target column.
train_data, train_target = preprocess_data(data)

In [34]:
# Sanity check: (rows, columns) of the feature frame returned by preprocess_data.
train_data.shape

(15587, 11310)

In [35]:
# Rescale the target: subtract 1, then divide by 4.
# NOTE(review): presumably ratings run 1..5, so this maps them onto [0, 1] as
# the "reg:logistic" objective used below expects -- confirm against the data.
# This cell is not idempotent: running it twice rescales the target twice, so
# re-run the preprocessing cell first if it has to be executed again.
train_target = train_target.sub(1).div(4)

In [36]:
# Two gradient-boosting base learners whose predictions are combined by an
# XGBoost meta-regressor.
regs = [
    LGBMRegressor(objective="mse"),
    XGBRegressor(objective="reg:logistic"),
]
reg = StackingRegressor(
    regressors=regs,
    meta_regressor=XGBRegressor(objective="reg:logistic"),
)

# Scorer objects created in the import cell, keyed by the label to report.
scoring = dict(MSE=mse, MAE=mae, r2_score=r2_score)

# 10 consecutive (unshuffled) folds; verbose=3 logs every fold with its scores.
# NOTE(review): the first fold logged r2 of about 0.99999, which is implausibly
# high for rating prediction -- check the feature frame for target leakage.
cv = cross_validate(
    estimator=reg,
    X=train_data,
    y=train_target,
    scoring=scoring,
    cv=KFold(n_splits=10),
    verbose=3,
)

# Collapse every per-fold array returned by cross_validate into (mean, std).
cv = {label: (values.mean(), values.std()) for label, values in cv.items()}
cv

[CV]  ................................................................
[CV]  , MSE=3.7764432841582077e-07, MAE=0.00027137020654496066, r2_score=0.9999964219685219, total= 6.9min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.7min remaining:    0.0s


[CV]  ................................................................


KeyboardInterrupt: 