In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
#######################
# FEATURE ENGINEERING #
#######################
"""
Main function
Input: pandas Series and a feature engineering function
Output: pandas Series
"""
def engineer_feature(series, func, normalize=True):
    feature = series.apply(func)
       
    if normalize:
        feature = pd.Series(z_normalize(feature.values.reshape(-1,1)).reshape(-1,))
    feature.name = func.__name__ 
    return feature

"""
Engineer features
Input: pandas Series and a list of feature engineering functions
Output: pandas DataFrame
"""
def engineer_features(series, funclist, normalize=True):
    features = pd.DataFrame()
    for func in funclist:
        feature = engineer_feature(series, func, normalize)
        features[feature.name] = feature
    return features

In [3]:
"""
Normalizer
Input: NumPy array
Output: NumPy array
"""
scaler = StandardScaler()
def z_normalize(data):
    scaler.fit(data)
    return scaler.transform(data)

def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    return len(re.findall(regexp, text))
    
"""
Feature functions
"""
def asterix_freq(x):
    return x.count('!')/len(x)

def uppercase_freq(x):
    return len(re.findall(r'[A-Z]',x))/len(x)

def links(x):
    return len(re.findall("(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?",str(x)))

def row_text_len(x):
    return max(len(x.split()), 1)

def raw_char_len(x):
    return len(x)

# Number of F words - f..k contains folk, fork,
def fk_freq(x):
    return count_regexp_occ(r"[Ff]\S{2}[Kk]", x) / row_text_len(x)

# Number of S word
def s_freq(x):
    return count_regexp_occ(r"[Ss]\S{2}[Kk]", x) / row_text_len(x)

# Number of D words
def dk_freq(x):
    return count_regexp_occ(r"[dD]ick", x) / row_text_len(x)

# Number of occurence of You, insulting someone usually needs someone called : you
def you_freq(x):
    return count_regexp_occ(r"\W[Yy]ou\W", x) / row_text_len(x)

# Just to check you really refered to my mother ;-)
def mother_freq(x):
    return count_regexp_occ(r"\Wmother\W", x) / row_text_len(x)

# Just checking for toxic 19th century vocabulary
def ng_freq(x):
    return count_regexp_occ(r"\Wnigger\W", x) / row_text_len(x)

# Some Sentences start with a <:> so it may help
def start_with_columns_freq(x):
    return count_regexp_occ(r"^\:+", x) / row_text_len(x)

def _has_date_long(x):
    return count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x)

def _has_date_short(x):
    return count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x)

# Check for time stamp
def has_timestamp(x):
    return (_has_date_long(x) + _has_date_short(x) + count_regexp_occ(r"\d{2}|:\d{2}", x)) != 0

# check for mail
def has_mail(x):
    return count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x) != 0

# Looking for words surrounded by == word == or """" word """"
def has_emphasize_equal(x):
    return count_regexp_occ(r"\={2}.+\={2}", x) != 0

def has_emphasize_quotes(x):
    return count_regexp_occ(r"\"{4}\S+\"{4}", x) != 0

In [4]:
"""
Import submission and OOF files
"""
def get_subs(names):
    subs = np.hstack([np.array(pd.read_csv("./submission_" + name + ".csv")[LABELS]) for name in names])
    oofs = np.hstack([np.array(pd.read_csv("./train_" + name + ".csv")[LABELS]) for name in names])
    return subs, oofs

In [16]:
train = pd.read_csv('./input/train_text_features.csv').fillna(' ')
test = pd.read_csv('./input/test_text_features.csv').fillna(' ')
sub = pd.DataFrame.from_dict({'id': test['id']})
INPUT_COLUMN = "comment_text"
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
new_text_features = ['capitals_vs_length', 'stopwords_vs_length', 'exclamation_marks_vs_length',
                     'unique_words_vs_length', 'repeated_words_vs_length', 'polarity_1st_sent', 
                     'subjectivity_1st_sent', 'polarity_last_sent', 'polarity', 'subjectivity',
                     'purity']

In [20]:
new_text_features_train = np.array(train[new_text_features])
new_text_features_test = np.array(test[new_text_features])

In [19]:
# Import submissions and OOF files
# gru_106: GRU trained on Fasttext (CV: 0.9893, LB: 0.9865)
# gru_107: GRU trained on Fasttext + SpartialDropout(CV: 0.9895, LB: ?)
# gru_110: GRU trained on Fasttext (CV: 0.9898, LB: 0.9865)

# lstm_100: LSTM trained on Fasttext (CV: 0.9890, LB: 0.9862)
# lstm_101: LSTM trained on Fasttext (CV: 0.9891, LB: ?)

# nb_logistic_regression_100: NB_LogisticRegression (CV: 0.9873, LB: ?)
# nb_logistic_regression_101: NB_LogisticRegression with stop words (CV: 0.9879, LB: 0.9823)

# textcnn_100: TextCNN (CV: 0.9790, LB: 0.9827)
# lgm_100: LigthGBM (CV: 0.9825, LB: 0.9807)

subnames = ['gru_110', 'lstm_101', 'nb_logistic_regression_101', 'lgm_100']
subs, oofs = get_subs(subnames)

In [23]:
# Engineer features
feature_functions = [len, asterix_freq, uppercase_freq, links, row_text_len, raw_char_len,
                     fk_freq, s_freq, dk_freq, you_freq, mother_freq, ng_freq, start_with_columns_freq,
                     has_timestamp, has_mail, has_emphasize_equal, has_emphasize_quotes]
features = [f.__name__ for f in feature_functions]
F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

X_train = np.hstack([F_train[features].as_matrix(), oofs, new_text_features_train])
#X_test = np.hstack([F_test[features].as_matrix(), subs, new_text_features_test])    



In [28]:
train = pd.read_csv('./input/train.csv').fillna(' ')
test = pd.read_csv('./input/test.csv').fillna(' ')

In [None]:
def get_ligthgbmstacker():
    return lgb.LGBMClassifier(max_depth=3,
                              metric="auc",
                              n_estimators=125,
                              num_leaves=10,
                              boosting_type="gbdt",
                              learning_rate=0.1,
                              feature_fraction=0.45,
                              colsample_bytree=0.45,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              reg_lambda=0.2)

## CV

In [30]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
kf = KFold(n_splits=5, shuffle=True, random_state=0xCAFFE)

In [33]:
n_jobs = 1
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for class_name in class_names[1:2]:
    print(class_name)
    params = {
        'reg_lambda': [0.4],
        'max_depth': [5],
    }
    
    base_estimator = lgb.LGBMClassifier(metric="auc",
                                        n_estimators=125,
                                        boosting_type="gbdt",
                                        learning_rate=0.1,
                                        feature_fraction=0.8,
                                        num_leaves=20)
    gs = GridSearchCV(
        estimator=base_estimator, 
        param_grid=params,
        cv=kf,
        error_score=1,
        scoring='roc_auc',
        n_jobs=n_jobs,
        verbose=2,
    )

    # perform grid search on TRAIN dataset ('is_train' filtering)
    gs.fit(
        X=X_train,
        y=np.array(train[class_name]),
    )
    
    best_score = gs.best_score_
    best_estimator = gs.best_estimator_
    print('ROC-AUC best: {:.4f}'.format(best_score))
    print(best_estimator)

severe_toxic
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] max_depth=5, reg_lambda=0.4 .....................................
[CV] ...................... max_depth=5, reg_lambda=0.4, total=  56.7s
[CV] max_depth=5, reg_lambda=0.4 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.7s remaining:    0.0s


[CV] ...................... max_depth=5, reg_lambda=0.4, total=  56.8s
[CV] max_depth=5, reg_lambda=0.4 .....................................
[CV] ...................... max_depth=5, reg_lambda=0.4, total=  57.9s
[CV] max_depth=5, reg_lambda=0.4 .....................................
[CV] ...................... max_depth=5, reg_lambda=0.4, total=  55.2s
[CV] max_depth=5, reg_lambda=0.4 .....................................
[CV] ...................... max_depth=5, reg_lambda=0.4, total=  57.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min finished


ROC-AUC best: 0.9919
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        feature_fraction=0.8, learning_rate=0.1, max_depth=5, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=125, n_jobs=-1, num_leaves=20, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.4, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=1)


In [None]:
def training(train_indices, val_indices, class_name, params):
    classifier = lgb.LGBMClassifier(metric="auc",
                                    boosting_type="gbdt",
                                    learning_rate=0.1,
                                    **params)
    
    x_train = X_train[train_indices]
    y_train = np.array(train[class_name])[train_indices]
    
    x_test = X_train[val_indices]
    y_test = np.array(train[class_name])[val_indices]
    
    classifier.fit(x_train, y_train)
    
    train_proba = classifier.predict_proba(x_train)[:, 1]
    val_proba = classifier.predict_proba(x_test)[:, 1]
    sub_proba = classifier.predict_proba(X_test)[:, 1]
    
    train_score = roc_auc_score(y_train, train_proba)
    val_score = roc_auc_score(y_test, val_proba)
    
    return train_score, val_score, val_proba, sub_proba, val_indices

In [None]:
submission = pd.DataFrame.from_dict({'id': test['id']})
train_submission = pd.DataFrame.from_dict({'id': train['id']})

In [None]:
predictors = 5

In [None]:
from tqdm import tqdm
import concurrent.futures

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

cv_params = [
    {
        'n_estimators': 125,
        'reg_lambda': 0.4,
        'max_depth': 3,
        'num_leaves': 20,
        'feature_fraction': 0.8,
    }
] * 6

scores = []
for i, class_name in enumerate(class_names):
    print('Class: %s' % class_name)
    
    sub_probas = np.zeros(shape=(len(test), ))
    train_probas = np.zeros(shape=(len(train), ))
    
    kf = KFold(n_splits=predictors, shuffle=True, random_state=0xCAFFE)
    
    train_scores, val_scores = [], []
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        
        futures = (executor.submit(training, 
                                   train_indices, 
                                   val_indices,
                                   class_name,
                                   cv_params[i]) 
                   for train_indices, val_indices in kf.split(train))
        
        for future in concurrent.futures.as_completed(futures):
            train_score, val_score, val_proba, sub_proba, val_indices = future.result()
            train_scores.append(train_score)
            val_scores.append(val_score)
            
            train_probas[val_indices] += val_proba
            sub_probas += sub_proba / predictors
    
    scores.append(np.mean(val_scores))
    print('\tTrain ROC-AUC: %s' % np.mean(train_scores))
    print('\tVal ROC-AUC: %s' % np.mean(val_scores))
    
    submission[class_name] = sub_probas
    train_submission[class_name] = train_probas
    
print('Total: %s' % np.mean(scores))

In [None]:
submission.to_csv('submission_ensemble_007.csv', index=False)
train_submission.to_csv('train_ensemble_007.csv', index=False)