In [None]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [None]:
#######################
# FEATURE ENGINEERING #
#######################
"""
Main function
Input: pandas Series and a feature engineering function
Output: pandas Series
"""
def engineer_feature(series, func, normalize=True):
    """Apply ``func`` to every element of ``series`` and return the result as a feature.

    Input:
        series: pandas Series holding the raw values (text comments in this notebook).
        func: callable applied element-wise; its ``__name__`` becomes the feature name.
        normalize: when True, the feature values are passed through ``z_normalize``.
    Output:
        pandas Series named ``func.__name__`` that carries the same index as ``series``.
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize is fed a column vector, so reshape to (n, 1) and flatten back.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1)
        # Rebuild on the input index so both branches return an identically indexed
        # Series; without it the result silently falls back to 0..n-1.
        feature = pd.Series(scaled, index=series.index)
    feature.name = func.__name__
    return feature

"""
Engineer features
Input: pandas Series and a list of feature engineering functions
Output: pandas DataFrame
"""
def engineer_features(series, funclist, normalize=True):
    """Build one feature column per function in ``funclist``.

    Input:
        series: pandas Series handed to ``engineer_feature`` for every function.
        funclist: iterable of feature engineering functions.
        normalize: forwarded unchanged to ``engineer_feature``.
    Output:
        pandas DataFrame whose columns are named after the produced features,
        in the order the functions were given.
    """
    table = pd.DataFrame()
    # The generator keeps evaluation lazy: each feature is computed right before
    # it is stored as a column.
    for column in (engineer_feature(series, fn, normalize) for fn in funclist):
        table[column.name] = column
    return table

In [None]:
"""
Normalizer
Input: NumPy array
Output: NumPy array
"""
# Shared scaler instance. It is re-fitted on every z_normalize call, so it only
# ever holds the statistics of the most recently normalized array.
scaler = StandardScaler()


def z_normalize(data):
    """Standardize ``data`` using the shared ``scaler``.

    The scaler is fitted on ``data`` itself and then applied to that same array,
    so each call is scaled by its own statistics (train and test features that
    are passed in separately are therefore normalized independently).

    Input: NumPy array in the layout accepted by ``StandardScaler.fit``
    Output: NumPy array returned by ``scaler.transform``
    """
    return scaler.fit(data).transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """Return the fraction of characters in ``x`` that are exclamation marks.

    NOTE(review): despite its name this counts '!' and not '*'. The name is left
    untouched because ``func.__name__`` is used as the feature's column name.

    Input: string
    Output: float in [0, 1]; 0.0 for an empty string (instead of raising
            ZeroDivisionError)
    """
    if not x:
        return 0.0
    return x.count('!') / len(x)

def uppercase_freq(x):
    """Return the fraction of characters in ``x`` that are ASCII uppercase letters.

    Only the characters matched by ``[A-Z]`` are counted.

    Input: string
    Output: float in [0, 1]; 0.0 for an empty string (instead of raising
            ZeroDivisionError)
    """
    if not x:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / len(x)

In [None]:
"""
Import submission and OOF files
"""
def get_subs(names):
    """Load the test and out-of-fold predictions of the given base models.

    For every name, ``./submission_<name>.csv`` and ``./train_<name>.csv`` are
    read and their ``LABELS`` columns extracted. The per-model blocks are
    stacked side by side in the order of ``names``.

    Input: iterable of model names
    Output: tuple ``(subs, oofs)`` of NumPy arrays (submission predictions,
            out-of-fold predictions)
    """
    def _load(prefix):
        # One array of label columns per model, concatenated column-wise.
        blocks = []
        for name in names:
            frame = pd.read_csv(prefix + name + ".csv")
            blocks.append(np.array(frame[LABELS]))
        return np.hstack(blocks)

    subs = _load("./submission_")
    oofs = _load("./train_")
    return subs, oofs

In [None]:
# Name of the column that the text features are engineered from.
INPUT_COLUMN = "comment_text"

# Load the data sets; missing values are replaced by a single space
# (presumably so the string feature functions never receive NaN).
train = pd.read_csv('./input/train.csv').fillna(' ')
test = pd.read_csv('./input/test.csv').fillna(' ')

# Target columns: everything after the first two columns of train.csv
# (assumed to be the id and the comment text - TODO confirm).
LABELS = train.columns[2:]

# Frame that collects the stacker predictions, keyed by the test ids.
sub = pd.DataFrame({'id': test['id']})

In [None]:
# Import submissions and OOF files
# gru_106: GRU trained on Fasttext (CV: 0.9893, LB: 0.9865)
# gru_107: GRU trained on Fasttext + SpatialDropout (CV: 0.9895, LB: ?)

# lstm_100: LSTM trained on Fasttext (CV: 0.9890, LB: 0.9862)
# lstm_101: LSTM trained on Fasttext (CV: 0.9891, LB: ?)

# nb_logistic_regression_100: NB_LogisticRegression (CV: 0.9873, LB: ?)
# nb_logistic_regression_101: NB_LogisticRegression with stop words (CV: 0.9879, LB: ?)

# textcnn_100: TextCNN (CV: 0.9790, LB: ?)
# lgm_100: LightGBM (CV: 0.9825, LB: ?)

# Base models whose test and out-of-fold predictions feed the stacker.
# NOTE(review): 'gru_108' has no entry in the model notes above - TODO document it.
subnames = [
    'gru_108',
    'lstm_101',
    'nb_logistic_regression_101',
    'textcnn_100',
    'lgm_100',
]
subs, oofs = get_subs(subnames)

In [7]:
# Engineer features
# Each function yields one column, named after the function itself.
feature_functions = [len, asterix_freq, uppercase_freq]
features = [f.__name__ for f in feature_functions]
F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

# Stacker inputs: engineered text features followed by the base-model predictions.
# `.values` is used instead of DataFrame.as_matrix(), which was deprecated and
# later removed from pandas; `.values` is available in old and new releases alike.
X_train = np.hstack([F_train[features].values, oofs])
X_test = np.hstack([F_test[features].values, subs])



In [None]:
def get_ligthgbmstacker():
    """Create the LightGBM classifier used as the second-level (stacking) model.

    Output: a new, unfitted ``lgb.LGBMClassifier`` with the fixed hyper-parameters
            listed below.
    """
    return lgb.LGBMClassifier(max_depth=3,
                              metric="auc",
                              n_estimators=125,
                              num_leaves=10,
                              boosting_type="gbdt",
                              learning_rate=0.1,
                              # `feature_fraction` is an alias of `colsample_bytree`
                              # in LightGBM; both were passed with the same value, so
                              # only the scikit-learn style name is kept.
                              colsample_bytree=0.45,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              reg_lambda=0.2)

## CV

In [8]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
# Shuffled 5-fold splitter with a fixed seed; passed as `cv` to the grid search.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0xCAFFE,
)

In [None]:
n_jobs = 1

# Tune `reg_lambda` and `max_depth` of the stacker separately for every label.
# LABELS is iterated here because it is defined in the data-loading cell;
# `class_names` is only created in a later cell, so using it breaks a fresh
# top-to-bottom run with a NameError.
for class_name in LABELS:
    print(class_name)
    params = {
        'reg_lambda': np.arange(0., 0.5, 0.1),
        'max_depth': np.arange(2, 7),
    }

    base_estimator = lgb.LGBMClassifier(metric="auc",
                                        n_estimators=125,
                                        boosting_type="gbdt",
                                        learning_rate=0.1,
                                        feature_fraction=0.8,
                                        num_leaves=20)
    gs = GridSearchCV(
        estimator=base_estimator,
        param_grid=params,
        cv=kf,
        # A failed fit must never win the search: NaN ranks it last, whereas the
        # previous value of 1 scored a crash as a perfect ROC-AUC.
        error_score=np.nan,
        scoring='roc_auc',
        n_jobs=n_jobs,
        verbose=2,
    )

    # Perform the grid search on the full stacked training matrix.
    gs.fit(
        X=X_train,
        y=np.array(train[class_name]),
    )

    best_score = gs.best_score_
    best_estimator = gs.best_estimator_
    print('ROC-AUC best: {:.4f}'.format(best_score))
    print(best_estimator)

In [9]:
def training(train_indices, val_indices, class_name, params):
    """Fit one LightGBM stacker on a single fold and score it.

    Reads the module-level ``X_train``, ``X_test`` and ``train``.

    Input:
        train_indices: row positions of ``X_train`` used for fitting.
        val_indices: row positions of ``X_train`` held out for validation.
        class_name: column of ``train`` used as the binary target.
        params: extra keyword arguments for ``lgb.LGBMClassifier``.
    Output:
        tuple ``(train_score, val_score, val_proba, sub_proba, val_indices)``:
        ROC-AUC on the fitting and held-out rows, positive-class probabilities
        for the held-out rows and for ``X_test``, and ``val_indices`` passed
        through unchanged so the caller can place ``val_proba``.
    """
    model = lgb.LGBMClassifier(metric="auc",
                               boosting_type="gbdt",
                               learning_rate=0.1,
                               **params)

    targets = np.array(train[class_name])
    fit_X, fit_y = X_train[train_indices], targets[train_indices]
    held_X, held_y = X_train[val_indices], targets[val_indices]

    model.fit(fit_X, fit_y)

    def positive_proba(matrix):
        # Column 1 of predict_proba holds the probability of the positive class.
        return model.predict_proba(matrix)[:, 1]

    train_proba = positive_proba(fit_X)
    val_proba = positive_proba(held_X)
    sub_proba = positive_proba(X_test)

    train_score = roc_auc_score(fit_y, train_proba)
    val_score = roc_auc_score(held_y, val_proba)

    return train_score, val_score, val_proba, sub_proba, val_indices

In [10]:
# Output frames for the fold-averaged ensemble: one keyed by the test ids (the
# submission) and one keyed by the train ids (the out-of-fold predictions).
submission = pd.DataFrame({'id': test['id']})
train_submission = pd.DataFrame({'id': train['id']})

In [11]:
# Number of CV folds for the stacking loop; one model is trained per fold and
# the test predictions are averaged over this many models.
predictors = 5

In [12]:
from tqdm import tqdm
import concurrent.futures

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# One independent parameter dict per class. Building them in a comprehension
# keeps the list in step with `class_names` and avoids the aliasing of
# `[dict] * 6`, where editing one entry would silently change all of them.
cv_params = [
    {
        'n_estimators': 125,
        'reg_lambda': 0.4,
        'max_depth': 3,
        'num_leaves': 20,
        'feature_fraction': 0.8,
    }
    for _ in class_names
]

# The splitter does not depend on the class, so it is created once. With an
# integer seed every call to split() yields the same shuffled folds.
kf = KFold(n_splits=predictors, shuffle=True, random_state=0xCAFFE)

scores = []
for i, class_name in enumerate(class_names):
    print('Class: %s' % class_name)

    # Fold-averaged test predictions and out-of-fold train predictions.
    sub_probas = np.zeros(shape=(len(test), ))
    train_probas = np.zeros(shape=(len(train), ))

    train_scores, val_scores = [], []
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:

        # Submit every fold up front; an explicit list makes it clear that all
        # jobs are queued before results are collected.
        futures = [executor.submit(training,
                                   train_indices,
                                   val_indices,
                                   class_name,
                                   cv_params[i])
                   for train_indices, val_indices in kf.split(train)]

        # Results may arrive in any order; each one carries its own val_indices.
        for future in concurrent.futures.as_completed(futures):
            train_score, val_score, val_proba, sub_proba, val_indices = future.result()
            train_scores.append(train_score)
            val_scores.append(val_score)

            # Each train row is in exactly one validation fold, so this fills
            # the out-of-fold vector without overlap.
            train_probas[val_indices] += val_proba
            sub_probas += sub_proba / predictors

    scores.append(np.mean(val_scores))
    print('\tTrain ROC-AUC: %s' % np.mean(train_scores))
    print('\tVal ROC-AUC: %s' % np.mean(val_scores))

    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas

print('Total: %s' % np.mean(scores))

Class: toxic
	Train ROC-AUC: 0.9897111782733923
	Val ROC-AUC: 0.9882707862034122
Class: severe_toxic
	Train ROC-AUC: 0.9940037302119296
	Val ROC-AUC: 0.9918370525766076
Class: obscene
	Train ROC-AUC: 0.9961298543510173
	Val ROC-AUC: 0.9953349240922534
Class: threat
	Train ROC-AUC: 0.9981861585637823
	Val ROC-AUC: 0.9932352425049469
Class: insult
	Train ROC-AUC: 0.9911852193608794
	Val ROC-AUC: 0.9895208183137179
Class: identity_hate
	Train ROC-AUC: 0.9946782463333145
	Val ROC-AUC: 0.9908986432074913
Total: 0.9915162444830715


In [None]:
# Persist the fold-averaged test predictions and the out-of-fold train
# predictions of the CV ensemble (written without the index column).
submission.to_csv('submission_ensemble_006.csv', index=False)
train_submission.to_csv('train_ensemble_006.csv', index=False)

In [None]:
stacker = get_ligthgbmstacker()
# Fit and submit: for every label, report the 5-fold CV ROC-AUC, then refit on
# the full training matrix and store the positive-class test probabilities.
scores = []
for label in LABELS:
    print(label)
    target = train[label]
    fold_aucs = cross_val_score(stacker, X_train, target, cv=5, scoring='roc_auc')
    mean_auc = np.mean(fold_aucs)
    print("AUC:", mean_auc)
    scores.append(mean_auc)
    stacker.fit(X_train, target)
    sub[label] = stacker.predict_proba(X_test)[:, 1]
print("CV score:", np.mean(scores))

In [None]:
# Persist the predictions of the single full-data stacker (without the index column).
sub.to_csv("submission_ensemble_005.csv", index=False)