In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
#######################
# FEATURE ENGINEERING #
#######################
def engineer_feature(series, func, normalize=True):
    """Apply ``func`` to every element of ``series`` to build one feature.

    Parameters
    ----------
    series : pandas.Series
        Raw values the feature is derived from.
    func : callable
        Element-wise feature function. Its ``__name__`` becomes the name of
        the returned Series.
    normalize : bool, default True
        If True, the feature is passed through ``z_normalize``.

    Returns
    -------
    pandas.Series
        Feature values, named after ``func`` and indexed like ``series``.
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize is fed a single-column 2-D array; flatten its result.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1)
        # Re-attach the original index. Without it the Series would get a
        # fresh RangeIndex and could misalign with the rows it came from
        # (and would differ from what the normalize=False branch returns).
        feature = pd.Series(scaled, index=feature.index)
    feature.name = func.__name__
    return feature

def engineer_features(series, funclist, normalize=True):
    """Build a feature table with one column per feature function.

    Parameters
    ----------
    series : pandas.Series
        Raw values handed to every feature function.
    funclist : list of callables
        Feature functions; each one is run through ``engineer_feature``.
    normalize : bool, default True
        Forwarded unchanged to ``engineer_feature``.

    Returns
    -------
    pandas.DataFrame
        One column per function, keyed by the name of the Series that
        ``engineer_feature`` returns.
    """
    columns = {}
    for feature_func in funclist:
        column = engineer_feature(series, feature_func, normalize)
        columns[column.name] = column
    return pd.DataFrame(columns)

In [3]:
# One module-level scaler instance is shared by all calls; z_normalize
# re-fits it on whatever data it is given.
scaler = StandardScaler()


def z_normalize(data):
    """Fit the shared scaler on ``data`` and return ``data`` transformed by it.

    Input:  NumPy array
    Output: NumPy array
    """
    fitted = scaler.fit(data)
    return fitted.transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """Return the fraction of characters in ``x`` that are ``'!'``.

    NOTE(review): despite the function name, the character counted is the
    exclamation mark, not ``'*'``. Confirm which was intended. The name is
    left as-is because ``engineer_feature`` uses ``func.__name__`` as the
    feature name.

    Returns 0.0 for an empty input instead of raising ZeroDivisionError.
    """
    if len(x) == 0:
        return 0.0
    return x.count('!') / len(x)

def uppercase_freq(x):
    """Return the fraction of characters in ``x`` matching ``[A-Z]``.

    Only ASCII capitals are counted, because the pattern is ``[A-Z]``.
    Returns 0.0 for an empty string instead of raising ZeroDivisionError.
    """
    if len(x) == 0:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / len(x)

In [4]:
def get_subs(names):
    """Import submission and OOF files for the given model names.

    For every ``name`` the files ``./submission_<name>.csv`` and
    ``./train_<name>.csv`` are read, their ``LABELS`` columns are taken, and
    the per-model arrays are stacked horizontally.

    Returns
    -------
    (subs, oofs) : tuple of NumPy arrays
        ``subs`` comes from the ``submission_`` files, ``oofs`` from the
        ``train_`` files.
    """
    def _stack_predictions(prefix):
        # One array of LABELS columns per model, joined column-wise.
        blocks = []
        for name in names:
            frame = pd.read_csv(prefix + name + ".csv")
            blocks.append(np.array(frame[LABELS]))
        return np.hstack(blocks)

    subs = _stack_predictions("./submission_")
    oofs = _stack_predictions("./train_")
    return subs, oofs

In [5]:
# Name of the text column the hand-crafted features are computed from.
INPUT_COLUMN = "comment_text"

# Load the data; missing values are replaced with a single space.
train = pd.read_csv('./input/train.csv').fillna(' ')
test = pd.read_csv('./input/test.csv').fillna(' ')

# Every train column from the third one onwards is used as a label.
LABELS = train.columns[2:]

# Submission frame, starting with only the test ids.
sub = pd.DataFrame({'id': test['id']})

In [6]:
# Import submissions and OOF files for the base models:
#   gru_106:                    GRU trained on Fasttext (CV: 0.9893, LB: 0.9865)
#   lstm_100:                   LSTM trained on Fasttext (CV: 0.9890, LB: 0.9862)
#   nb_logistic_regression_100: NB_LogisticRegression (CV: 0.9873, LB: ?)
#   textcnn_100:                TextCNN (CV: 0.9790, LB: ?)
subnames = [
    'gru_106',
    'lstm_100',
    'nb_logistic_regression_100',
    'textcnn_100',
]
subs, oofs = get_subs(subnames)

In [7]:
# Engineer features
feature_functions = [len, asterix_freq, uppercase_freq]
# Column names follow the function names (see engineer_feature).
features = [f.__name__ for f in feature_functions]
F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

# Stacker input: engineered features followed by the base-model predictions.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and is available in both old and new pandas versions.
X_train = np.hstack([F_train[features].values, oofs])
X_test = np.hstack([F_test[features].values, subs])



In [8]:
def get_ligthgbmstacker():
    """Return an unfitted LightGBM classifier configured as the stacker.

    ``feature_fraction`` is no longer passed: it is an alias of
    ``colsample_bytree`` in LightGBM, and the original call supplied both
    with the same value (0.45). Keeping only ``colsample_bytree`` leaves the
    value unchanged and removes the duplicate specification.
    """
    return lgb.LGBMClassifier(max_depth=3,
                              metric="auc",
                              n_estimators=125,
                              num_leaves=10,
                              boosting_type="gbdt",
                              learning_rate=0.1,
                              colsample_bytree=0.45,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              reg_lambda=0.2)

In [9]:
def get_catbooststacker():
    """Return an unfitted ``CatBoostClassifier`` created with ``verbose=False``.

    catboost is imported inside the function, so the package is only needed
    when this stacker is actually requested.
    """
    from catboost import CatBoostClassifier

    model = CatBoostClassifier(verbose=False)
    return model

In [10]:
# Fit and submit: one binary stacker per label.
stacker = get_ligthgbmstacker()
scores = []
for label in LABELS:
    print(label)
    target = train[label]
    # 5-fold cross-validated ROC AUC of the stacker for this label.
    score = cross_val_score(stacker, X_train, target, cv=5, scoring='roc_auc')
    mean_auc = np.mean(score)
    print("AUC:", mean_auc)
    scores.append(mean_auc)
    # Refit on the full training set, then keep the second predict_proba
    # column as the prediction for this label.
    stacker.fit(X_train, target)
    sub[label] = stacker.predict_proba(X_test)[:, 1]
print("CV score:", np.mean(scores))

toxic
AUC: 0.988153335340402
severe_toxic
AUC: 0.9918707421210826
obscene
AUC: 0.9952986393184855
threat
AUC: 0.9928283419175848
insult
AUC: 0.989678506253948
identity_hate
AUC: 0.9902948739890981
CV score: 0.9913540731567668


In [10]:
# Write the ensemble predictions to disk, without the DataFrame index column.
sub.to_csv(path_or_buf="submission_ensemble_005.csv", index=False)