In this competition, we are asked to predict the toxicity rating of comments while taking unintended bias into account. Let's try it out.

In [None]:
import numpy as np
import pandas as pd
import os
import time
import warnings
from tqdm import tqdm
import gc
warnings.filterwarnings('ignore')

In [None]:
# Show which datasets are available under ../input.
os.listdir("../input")

In [None]:
%%time

# Read the pre-cleaned train/test tables and the competition's sample
# submission file (the latter is filled in with predictions at the end).
train=pd.read_csv("../input/toxic-comment-clean/train_cleaned.csv")
test=pd.read_csv("../input/toxic-comment-clean/test_cleaned.csv")
submission=pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

We use the fastText embeddings here. The code is borrowed from / inspired by Theo Viel's kernel: https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

In [None]:
# Location of the fastText crawl-300d-2M vector file; default input of load_embed().
EMB_PATH='../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

In [None]:
def get_coefs(word,*arr):
    """Pair an embedding token with its coefficients.

    Parameters
    ----------
    word : the token (first field of an embedding line).
    *arr : the remaining fields; each must be convertible to float.

    Returns
    -------
    tuple
        ``(word, coefficients)`` where ``coefficients`` is a float32 numpy array.
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients

def load_embed(embed_dir=EMB_PATH):
    """Load a text-format embedding file into a ``{word: float32 vector}`` dict.

    Each line is expected to hold a token followed by its space-separated
    coefficients. If the first line has exactly two fields it is treated as
    the "<vocab size> <dimension>" header that fastText ``.vec`` files start
    with and is skipped; otherwise it would be stored as a bogus word whose
    one-element vector later broadcasts across a whole embedding row.

    Parameters
    ----------
    embed_dir : path of the embedding file (defaults to ``EMB_PATH``).

    Returns
    -------
    dict
        Maps each token to its coefficient array (see ``get_coefs``). When a
        token occurs more than once, the last occurrence wins.
    """
    embedding_index = {}
    # The context manager closes the file even if parsing fails; the original
    # left the handle open. The encoding is explicit so decoding does not
    # depend on the machine's locale.
    with open(embed_dir, encoding='utf-8') as embed_file:
        for line_no, line in enumerate(tqdm(embed_file)):
            fields = line.strip().split(" ")
            if line_no == 0 and len(fields) == 2:
                # Header line ("<count> <dim>"), not a word vector.
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    gc.collect()
    return embedding_index

# def build_vocab(texts):
#     sentences = texts.apply(lambda x: x.split()).values
#     vocab = {}
#     for sentence in sentences:
#         for word in sentence:
#             try:
#                 vocab[word] += 1
#             except KeyError:
#                 vocab[word] = 1
#     return vocab

# def check_coverage(vocab, embeddings_index):
#     known_words = {}
#     unknown_words = {}
#     nb_known_words = 0
#     nb_unknown_words = 0
#     for word in vocab.keys():
#         try:
#             known_words[word] = embeddings_index[word]
#             nb_known_words += vocab[word]
#         except:
#             unknown_words[word] = vocab[word]
#             nb_unknown_words += vocab[word]
#             pass

#     print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
#     print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
#     unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

#     return unknown_words


In [None]:
### Load the embedding index (word -> vector) from the default path EMB_PATH.

embed =load_embed()

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min and
      max (bounds are inclusive, so e.g. 127 fits ``int8``).
    * Float columns are cast to the narrowest float type whose range contains
      the column's min and max. WARNING: this is a range check only, so
      float16/float32 casts round the stored values. Pass
      ``use_float16=False`` to stop at float32.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched, which also makes the function safe to call twice on
      the same frame.

    A column is never widened, and columns whose min/max cannot be compared
    against the limits (all-NaN, empty, containing inf) keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    use_float16 : bool, default True
        Whether float16 is an allowed target for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)
    if not use_float16:
        float_types = float_types[1:]

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. The original
        # sent bool and unsigned columns through the float branch (turning
        # them into float16) and crashed on category columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered narrowest first; stop as soon as a cast
            # would no longer save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage is always defined.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Downcast numeric columns of the training table and turn object columns into
# categoricals to lower memory use.
train=reduce_mem_usage(train)

In [None]:
# vocab=build_vocab(train['comment_text'])

In [None]:
# oov_fasttext=check_coverage(vocab,embed)

Let's get to modelling. Inspired by this kernel: https://www.kaggle.com/thousandvoices/simple-lstm

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn import metrics
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

In [None]:
# del(vocab,vocab_test,oov_fasttext,oof_fasttext_test)
# gc.collect()

In [None]:
def build_matrix(word_index,embedding_index,embed_dim=300):
    """Build the embedding weight matrix for a tokenizer vocabulary.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer index.
    embedding_index : mapping
        Maps words to their embedding vectors; missing words raise KeyError
        on lookup and are skipped.
    embed_dim : int, default 300
        Length of every embedding vector (previously hard-coded to 300).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embed_dim)``. Row ``i`` holds
        the vector of the word whose index is ``i``; rows for unknown words
        (and row 0, which no word in ``word_index`` is assigned here unless
        it maps to 0) stay all-zero.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    for word, i in word_index.items():
        try:
            vector = embedding_index[word]
        except KeyError:
            continue
        # A vector of the wrong length is not a usable embedding. A
        # one-element vector would otherwise be silently broadcast across the
        # whole row, and any other length would raise. Leave the row at zero.
        if np.shape(vector) != (embed_dim,):
            continue
        embedding_matrix[i] = vector
    return embedding_matrix

In [None]:
# Identity subgroups for which the bias metrics are evaluated.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]
# Turn every identity column, and the target, into a boolean flag that is
# True where the value is at least 0.5.
for col in identity_columns + ['target']:
    print(f'\n Converting {col} to boolean')
    train[col] = (train[col] >= 0.5).values




In [None]:
# train.head()

In [None]:
#x_train=train['comment_treated']
# 0/1 integer labels: 1 where the target is at least 0.5, else 0.
y_train = (train['target'] >= 0.5).values * 1
#y_aux_train = train[['target', 'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish','muslim', 'black', 'white', 'psychiatric_or_mental_illness']]
#x_test=test['comment_treated']


In [None]:
# print(f'Shape of x_train {x_train.shape} \n Shape of y_train {y_train.shape} \n Shape of x_test {x_test.shape}'
# )

In [None]:
%%time

# Fit the Keras tokenizer on the treated comments of both train and test so
# that the word index covers both sets; num_words caps the vocabulary that
# texts_to_sequences will emit at the 90000 most frequent words.
tokenizer = text.Tokenizer(num_words=90000)
tokenizer.fit_on_texts(list(train['comment_treated'])+list(test['comment_treated']))


In [None]:
# Model / training configuration.
# NOTE(review): NUM_MODELS, BATCH_SIZE and EPOCHS are not referenced by the
# active (non-commented) code visible in this notebook; build_model uses its
# own batch size and epoch count.
MAX_LEN=220  # padded sequence length fed to the model
LSTM_UNITS=128  # units per direction of the bidirectional LSTM
NUM_MODELS = 1
BATCH_SIZE = 512
# Equals the width of the bidirectional LSTM output (2 * LSTM_UNITS), which
# the residual add() in build_model requires.
DENSE_HIDDEN_UNITS = 2 * LSTM_UNITS
EPOCHS = 5

In [None]:
# Embedding weights aligned with the tokenizer's word indices; words without
# an entry in `embed` keep an all-zero row.
embedding_matrix=build_matrix(tokenizer.word_index,embed)

In [None]:
### Defining the metrics:
### https://www.kaggle.com/dborkan/benchmark-kernel/
# Names of the three per-subgroup bias metrics; they are used as record keys
# (and therefore DataFrame column names) in compute_bias_metrics_for_model
# and are read back by get_final_metric.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    If ``roc_auc_score`` raises ``ValueError`` the result is ``np.nan``
    instead of an exception, so callers can keep aggregating.
    """
    try:
        auc = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """AUC computed only on the rows where the boolean ``subgroup`` column is True.

    ``label`` names the ground-truth column and ``model_name`` the column
    holding the model's scores.
    """
    in_subgroup = df[subgroup]
    rows = df[in_subgroup]
    return compute_auc(rows[label], rows[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    ``subgroup`` and ``label`` must name boolean columns of ``df`` (they are
    combined with ``&`` / ``~``); ``model_name`` names the score column.
    Returns whatever ``compute_auc`` returns for the combined rows.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # selections in the same order on every pandas version.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    ``subgroup`` and ``label`` must name boolean columns of ``df`` (they are
    combined with ``&`` / ``~``); ``model_name`` names the score column.
    Returns whatever ``compute_auc`` returns for the combined rows.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # selections in the same order on every pandas version.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Build a table of bias metrics, one row per subgroup, for one model.

    Parameters
    ----------
    dataset : DataFrame with boolean subgroup columns, the label column and
        the model's score column.
    subgroups : iterable of subgroup column names.
    model : name of the column holding the model's scores.
    label_col : name of the ground-truth label column.
    include_asegs : accepted for interface compatibility; not used here.

    Returns
    -------
    DataFrame with the subgroup name, its row count and the subgroup, BPSN
    and BNSP AUCs, sorted by ascending subgroup AUC.
    """
    def _metrics_row(name):
        # Keys are inserted in the order the columns should appear.
        return {
            'subgroup': name,
            'subgroup_size': len(dataset[dataset[name]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, name, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, name, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, name, label_col, model),
        }

    rows = [_metrics_row(name) for name in subgroups]
    return pd.DataFrame(rows).sort_values('subgroup_auc', ascending=True)




In [None]:
def calculate_overall_auc(df, model_name):
    """ROC AUC of the model's scores over every row of ``df``.

    Parameters
    ----------
    df : DataFrame containing a ``'target'`` column with the true labels and
        a ``model_name`` column with the predicted scores.
    model_name : name of the prediction column.

    Returns
    -------
    The value of ``sklearn.metrics.roc_auc_score`` for those two columns.
    """
    true_labels = df['target']
    # Read the column named by the argument. The original looked up a global
    # `oof_name`, ignoring `model_name` and failing with NameError whenever
    # that global was not defined.
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    """Generalised (power) mean of ``series`` with exponent ``p``.

    Computes ``(sum(x ** p) / n) ** (1 / p)``. A NaN anywhere in ``series``
    propagates to the result.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged per-subgroup bias scores.

    For each of the three bias-metric columns of ``bias_df`` the power mean
    (exponent ``POWER``) across subgroups is taken; the three results are
    averaged into a bias score. The return value is
    ``OVERALL_MODEL_WEIGHT * overall_auc + (1 - OVERALL_MODEL_WEIGHT) * bias_score``.
    """
    metric_columns = (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    per_metric_means = [power_mean(bias_df[column], POWER) for column in metric_columns]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part
    


In [None]:
def build_model(x_train,y_train,x_valid,y_valid,embedding_matrix,patience=3,
                batch_size=128,epochs=3):
    """Build, compile and fit the bidirectional-LSTM toxicity classifier.

    Architecture: frozen embedding layer initialised from
    ``embedding_matrix`` -> spatial dropout (0.3) -> bidirectional CuDNNLSTM
    returning sequences -> global max pooling -> residual tanh dense block
    -> single sigmoid output. Reads the module-level constants ``MAX_LEN``,
    ``LSTM_UNITS`` and ``DENSE_HIDDEN_UNITS``.

    Parameters
    ----------
    x_train, y_train : padded training sequences and their binary labels.
    x_valid, y_valid : validation data, monitored through ``val_loss``.
    embedding_matrix : 2-D weight array; its shape sets the embedding
        layer's input and output dimensions.
    patience : epochs without ``val_loss`` improvement before early stopping.
    batch_size : mini-batch size for ``fit`` (default 128, the value that was
        previously hard-coded).
    epochs : maximum number of epochs (default 3, previously hard-coded).

    Returns
    -------
    The fitted Keras ``Model``.

    NOTE(review): with the defaults ``epochs=3`` and ``patience=3`` the
    early-stopping callback cannot fire, since at most two epochs can pass
    without improvement. Raise ``epochs`` or lower ``patience`` for it to
    have an effect.
    """
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    words = Input(shape=(MAX_LEN,))
    # Pre-trained vectors are kept fixed during training.
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = GlobalMaxPooling1D()(x)
    # Residual connection: the dense output must have the same width as
    # `hidden` for add() to work.
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='tanh')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=["accuracy"])
    model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs, validation_data=(x_valid, y_valid),
                        verbose=2, callbacks=[early_stop])

    return model

In [None]:
# n_fold = 2
# folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=11)

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# The labels are derived from `train` here instead of re-using `y_train` as
# the split input. The original overwrote its own input, so running this cell
# a second time fed the already-split labels back in and failed with a
# length mismatch.
all_labels = (train['target'] >= 0.5).values * 1
x_train,x_valid,y_train,y_valid=train_test_split(train,all_labels,test_size=0.2,random_state=100)

In [None]:
# Tokenise the same treated text column the tokenizer was fitted on and the
# model is trained on. The original used the raw 'comment_text' column here,
# so test inputs were preprocessed differently from the training inputs.
test_tokenized = tokenizer.texts_to_sequences(test['comment_treated'])
X_test = sequence.pad_sequences(test_tokenized, maxlen = MAX_LEN)

In [None]:
# Accumulator for the test-set predictions, one row per test example.
prediction=np.zeros((len(X_test),1))
print("\n Starting tokenizing")
train_tokenised=tokenizer.texts_to_sequences(x_train['comment_treated'])
valid_tokenised=tokenizer.texts_to_sequences(x_valid['comment_treated'])
print("\n Starting padding")
X_train=sequence.pad_sequences(train_tokenised,maxlen=MAX_LEN)
X_valid=sequence.pad_sequences(valid_tokenised,maxlen=MAX_LEN)
print("\n Building model")
model = build_model(X_train, y_train, X_valid, y_valid,embedding_matrix,patience=3)
gc.collect()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
print("\n Validation prediction")
pred_valid = model.predict([X_valid])
#valid_df=X_valid.copy()
#valid_df['predicted_target']=pred_valid
#bias_metrics_df = compute_bias_metrics_for_model(pred_valid, identity_columns, oof_name, 'target')
#scores.append(get_final_metric(bias_metrics_df, calculate_overall_auc(pred_valid, oof_name)))
print("\n Test set prediction")
prediction += model.predict(X_test, batch_size = 1024, verbose = 1)

In [None]:
# # valid_checkpoint_predictions = []
# # checkpoint_predictions=[]
# # valid_weights = []
# # weights=[]
# # for model_idx in range(NUM_MODELS):
# #     model = build_model(embedding_matrix, y_aux_train_df.shape[-1])
# #     for global_epoch in range(EPOCHS):
# #         model.fit(
# #             x_train_df,
# #             [y_train_df,y_aux_train_df],
# #             batch_size=BATCH_SIZE,
# #             epochs=1,
# #             verbose=2,
# #             callbacks=[
# #                 LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))
# #             ]
# #         )
# #         valid_checkpoint_predictions.append(model.predict(x_valid_df, batch_size=2048)[0].flatten())
# #         valid_weights.append(2 ** global_epoch)
# # checkpoint_predictions.append(model.predict(x_test,batch=2048)[0].flatten())


# # predictions = np.average(checkpoint_predictions, axis=0)

# def train_model(X, X_test, y):
    
#     oof = np.zeros((len(X), 1))
#     prediction = np.zeros((len(X_test), 1))
#     scores = []
#     print("\n Started tokenizing for test set")
#     test_tokenized = tokenizer.texts_to_sequences(test['comment_text'])
#     X_test = sequence.pad_sequences(test_tokenized, maxlen = MAX_LEN)
#     print("\n Completed tokenizing and padding.Starting the train-valid epochs.")
#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
#         print('Fold', fold_n, 'started at', time.ctime())
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]
#         valid_df = X_valid.copy()    
#         print(f'\n Tokenizing for fold {fold_n}')
#         train_tokenised=tokenizer.texts_to_sequences(X_train['comment_treated'])
#         valid_tokenised=tokenizer.texts_to_sequences(X_valid['comment_treated'])
#         print(f'\n Tokenising completed.Starting padding for fold {fold_n}')
#         X_train=sequence.pad_sequences(train_tokenised,maxlen=MAX_LEN)
#         X_valid=sequence.pad_sequences(valid_tokenised,maxlen=MAX_LEN)
#         print(f'\n Padding completed .Started model building for fold {fold_n}')
#         model = build_model(X_train, y_train, X_valid, y_valid,embedding_matrix,patience=3)
#         print(f'\n Model building completed for fold {fold_n}')
#         pred_valid = model.predict(X_valid)
#         oof[valid_index] = pred_valid
#         valid_df[oof_name] = pred_valid
#         print("Started calculating the bias metric and final metric")
#         bias_metrics_df = compute_bias_metrics_for_model(valid_df, identity_columns, oof_name, 'target')
#         scores.append(get_final_metric(bias_metrics_df, calculate_overall_auc(valid_df, oof_name)))
#         print("Completed finding the bias metric.Started prediction for test set")
#         prediction += model.predict(X_test, batch_size = 1024, verbose = 1)
    
#     prediction /= n_fold
    
#     # print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
#     return oof, prediction, scores

In [None]:
# oof_name = 'predicted_target'
# oof, prediction, scores = train_model(X=train, X_test=test, y=y_train)
# print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
# `prediction` has shape (n_test, 1); flatten it so a plain 1-D column is
# assigned instead of relying on pandas to accept a 2-D array for one column.
submission['prediction']=prediction.ravel()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Sanity-check the first rows of the submission.
submission.head()