In [None]:
import numpy as np 
import pandas as pd 
from nltk.corpus import stopwords, wordnet
import string
import re
import xgboost as xgb
import gc
from sklearn import preprocessing, metrics, ensemble, neighbors, linear_model, tree, model_selection
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import manifold, decomposition, naive_bayes
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy import sparse

In [None]:
#from subprocess import check_output
#print(check_output(["ls"]).decode("utf8"))

In [None]:
# Load the train and test comment tables from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Replace every missing value in both tables with the placeholder "unknown".
train_df, test_df = train_df.fillna("unknown"), test_df.fillna("unknown")

In [None]:
# Report how many rows each split contains.
print("Number of rows in train dataset : ", len(train_df))
print("Number of rows in test dataset : ", len(test_df))

In [None]:
# Print the value counts of every target column. The heading is built from
# the same name that selects the column, so each heading always matches the
# counts printed under it (including identity_hate).
for label_name in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print("Distribution of {} : \n".format(label_name), train_df[label_name].value_counts())

In [None]:
#l=train.toxic+train.severe_toxic+train.obscene+train.threat+train.insult+train.identity_hate

In [None]:
#l.value_counts()

In [None]:
# Display the comment stored under index label 0 as a sanity check.
train_df.loc[0, 'comment_text']

In [None]:
# function to clean the comment
# adapted from a kaggle kernel, can't find its link now

# Ordered (pattern, replacement) rules. Order matters: every rule sees the
# output of the rules before it, and the final rule blanks out anything that
# is not an ASCII letter or digit.
_CLEAN_RULES = [
    (r"\n", " "),
    (r"what's", ""),
    (r"What's", ""),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"I'm", "I am"),
    (r" m ", " am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"60k", " 60000 "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    # NOTE(review): in Python's re syntax "\0" is an octal escape for the NUL
    # character, so this rule (and "\0rs " below) only matches a literal NUL
    # followed by the given letters -- confirm the intended pattern.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e-mail", "email"),
    (r"\s{2,}", " "),
    (r"quikly", "quickly"),
    (r" usa ", " America "),
    (r" USA ", " America "),
    (r" u s ", " America "),
    (r" uk ", " England "),
    (r" UK ", " England "),
    (r"india", "India"),
    (r" dms ", "direct messages "),
    (r"actived", "active"),
    (r"kms", " kilometers "),
    (r"KMs", " kilometers "),
    (r"\0rs ", " rs "),
    (r"calender", "calendar"),
    # Word boundaries keep this rule from rewriting the inside of ordinary
    # words such as "obvious", "previous" or "curious".
    (r"\bios\b", "operating system"),
    (r"bestfriend", "best friend"),
    # Word boundaries keep this rule from rewriting e.g. "kidnap".
    (r"\bdna\b", "DNA"),
    (r"III", "3"),
    (r"Find", "find"),
    (r"[^A-Za-z0-9]", " "),
]


def cleanData(text):
    """Normalise a raw comment with a fixed sequence of regex substitutions.

    The rules expand common English contractions, fix a handful of known
    misspellings and abbreviations, and finally replace every character that
    is not an ASCII letter or digit with a single space.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned comment. Whitespace is not stripped or re-collapsed after
        the final substitution, so the result may contain runs of spaces.
    """
    for pattern, replacement in _CLEAN_RULES:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Run the regex-based cleaner over every comment of both splits.
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(cleanData)

In [None]:
@lru_cache(maxsize=None)
def _get_stemmer():
    """Create the English Snowball stemmer once and reuse it on later calls."""
    return EnglishStemmer()


@lru_cache(maxsize=None)
def _get_lemmatizer():
    """Create the WordNet lemmatizer once and reuse it on later calls."""
    return WordNetLemmatizer()


# define function for stemming and lemmatizing the text
def cleanData2(text, stemming = False, lemmatize=False):
    """Remove hyphens from ``text`` and optionally stem and/or lemmatize it.

    Parameters
    ----------
    text : object
        The comment; it is converted with ``str()`` first.
    stemming : bool
        When true, every whitespace-separated token is stemmed.
    lemmatize : bool
        When true, every whitespace-separated token is lemmatized. If both
        flags are set, lemmatization runs on the already-stemmed tokens.

    Returns
    -------
    str
        The processed text. When either flag is set the tokens are re-joined
        with single spaces.
    """
    txt = str(text).replace("-", "")

    if stemming:
        # The stemmer is cached: this function is mapped over every row, and
        # building a new stemmer per row is pure overhead.
        stem = _get_stemmer().stem
        txt = " ".join(stem(word) for word in txt.split())

    if lemmatize:
        lemma = _get_lemmatizer().lemmatize
        txt = " ".join(lemma(word) for word in txt.split())

    return txt

In [None]:
# Stem and then lemmatize every cleaned comment of both splits.
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].map(
        lambda comment: cleanData2(comment, stemming=True, lemmatize=True)
    )

In [None]:
# Persist the cleaned tables so the preprocessing above can be skipped later.
for frame, out_path in ((train_df, 'data/train_cleaned.csv'),
                        (test_df, 'data/test_cleaned.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# Preview the first five rows of the cleaned training table.
train_df.head(5)

In [None]:
# Word-level and character-level (1-2 grams, case preserved) TF-IDF features
# of the training comments, stacked side by side into one sparse matrix.
tfidf_word = TfidfVectorizer()
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), lowercase=False)

X_tfidf_word = tfidf_word.fit_transform(train_df['comment_text'])
X_tfidf_char = tfidf_char.fit_transform(train_df['comment_text'])

X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])


In [None]:
# Project the test comments into the SAME feature space as the training
# matrix: reuse the vectorizers already fitted on the training comments and
# only call transform(). Re-fitting on the test split would learn a different
# vocabulary, so the columns would not line up with X_tfidf.
X1_tfidf_word = tfidf_word.transform(test_df['comment_text'])
X1_tfidf_char = tfidf_char.transform(test_df['comment_text'])
# Stack the test blocks (X1_*), not the training matrices.
X1_tfidf = sparse.hstack([X1_tfidf_word, X1_tfidf_char])

In [None]:
### Fit transform the tfidf vectorizer ###
tfidf_vec = TfidfVectorizer(analyzer='char',stop_words='english', max_features=8000,, ngram_range=(1,3))
full_tfidf = tfidf_vec.fit_transform(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

In [None]:
# Word-level TF-IDF (1-3 grams, English stop words removed, top 8000 terms)
# fitted on train + test comments together. The frames are named
# train_df / test_df in this notebook; the bare names `train` / `test` are
# never defined.
transform_com = TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1,3)).fit(
    pd.concat([train_df['comment_text'], test_df['comment_text']], axis=0)
)
comments_train = transform_com.transform(train_df['comment_text'])
comments_test = transform_com.transform(test_df['comment_text'])
gc.collect()

In [None]:
# Compress the TF-IDF matrix into n_comp dense SVD components, fitted on the
# combined train + test matrix, and append them to both tables as columns
# svd_word_0 .. svd_word_{n_comp-1}.
n_comp = 80
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack').fit(full_tfidf)

svd_columns = ['svd_word_'+str(component) for component in range(n_comp)]
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf), columns=svd_columns)
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf), columns=svd_columns)

train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)

# Drop the names of the large intermediates so their memory can be reclaimed.
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

In [None]:
### Fit transform the count vectorizer ###
tfidf_vec = CountVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
tfidf_vec.fit(train_df['comment_text'].values.tolist() + test_df['comment_text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['comment_text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['comment_text'].values.tolist())

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=5000, dep=6, eta=0.1):
    """Train a binary-logistic XGBoost model and predict on up to two sets.

    Parameters
    ----------
    train_X, train_y
        Training features and binary labels.
    test_X
        Features to predict on (the validation fold in the CV loop).
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is used as a watchlist
        entry with early stopping (patience 100 rounds) and the log-loss on
        it is returned.
    test_X2 : optional
        A second feature set to predict on; skipped when ``None``.
    feature_names : optional
        When given, the model is dumped to ``xgbmodel.txt`` and normalised
        feature importances are written to ``imp_feat.txt``.
    seed_val, rounds, dep, eta
        Random seed, maximum boosting rounds, tree depth and learning rate.

    Returns
    -------
    tuple
        ``(pred_test_y, loss, pred_test_y2)``. ``loss`` is 0 when ``test_y``
        is ``None``; ``pred_test_y2`` is ``None`` when ``test_X2`` is ``None``.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": eta,
        "subsample": 0.6,
        "min_child_weight": 5,
        "colsample_bytree": 0.6,
        "max_depth": dep,
        "silent": 1,
        "seed": seed_val,
        # "max_delta_step": 2,
        # "gamma": 0.5,
    }
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        # NOTE(review): create_feature_map is not defined in the visible part
        # of this notebook; it must exist before feature_names is passed.
        create_feature_map(feature_names)
        model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
        importance = model.get_fscore(fmap='xgb.fmap')
        # Sort by score with a plain key function; the `operator` module is
        # not imported by this notebook.
        importance = sorted(importance.items(), key=lambda item: item[1], reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    # best_ntree_limit may be absent when training ran without early stopping;
    # ntree_limit=0 tells XGBoost to use all trees.
    # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
    # releases -- confirm against the installed version.
    best_limit = getattr(model, "best_ntree_limit", 0)
    pred_test_y = model.predict(xgtest, ntree_limit=best_limit)

    # test_X2 defaults to None, and a DMatrix cannot be built from None.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=best_limit)

    loss = 0
    if test_y is not None:
        loss = metrics.log_loss(test_y, pred_test_y)
    return pred_test_y, loss, pred_test_y2

In [None]:
# Target columns, and a (test rows x targets) matrix that will receive the
# predicted probability of each target.
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((len(test_df), len(col)))

In [None]:
# Keep only the engineered feature columns: drop the id, the raw text and
# (for the training table) the target columns.
cols_to_drop = ['id', 'comment_text']
train_X = train_df.drop(columns=cols_to_drop + col)
test_X = test_df.drop(columns=cols_to_drop)

In [None]:
# Preview the first five rows of the test feature table.
test_X.head(5)

In [None]:
# Stratified K-fold training, one binary model per target column.
# Out-of-fold predictions give the per-target CV log-loss; test predictions
# are averaged over the folds and stored in preds[:, i].
n_splits = 5
for model_name in ["XGB1"]:
    avg_score = 0
    for i, j in enumerate(col):
        print("Model building..")
        kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
        cv_scores = []

        pred_test_full = 0
        pred_val_full = np.zeros(train_df.shape[0])
        for dev_index, val_index in kf.split(train_tfidf, train_df[j]):
            dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
            # kf.split yields positional indices, so select positionally.
            dev_y, val_y = train_df[j].iloc[dev_index], train_df[j].iloc[val_index]

            if model_name == "XGB1":
                pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_tfidf, rounds=5000)#, feature_names=dev_X.columns.tolist())
            elif model_name == "LGB1":
                # NOTE(review): runLGB is not defined in the visible part of
                # this notebook; this branch is unreachable with ["XGB1"].
                pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_tfidf, rounds=5000)
            else:
                raise ValueError("Unknown model name: " + model_name)
            pred_val_full[val_index] = pred_val
            pred_test_full = pred_test_full + pred_test
            cv_scores.append(loss)
            print(cv_scores)
        # Average the test predictions over the folds actually run.
        pred_test_full /= float(n_splits)
        preds[:,i] = pred_test_full
        label_score = metrics.log_loss(train_df[j], pred_val_full)
        avg_score = avg_score + label_score
        print(label_score)
    # Mean out-of-fold log-loss over all target columns.
    print(avg_score/len(col))
        #out_df = pd.DataFrame({"transaction_id":test_id})
        #out_df["target"] = pred_test_full
        #out_df.to_csv("pred_test_v5_"+model_name+".csv", index=False)

In [None]:
#1500
# Scratch log of earlier results, kept as comments so that the cell is valid
# Python and survives "Run All".
# NOTE(review): presumably one CV log-loss per target column, in `col`
# order -- confirm.
# 0.174851923545  0.0307588272292  0.097675168881  0.0134371405698  0.110697548472  0.0331138548092 #sub1
# 0.172551708056  0.0303350523081  0.096968593905  0.0131761250039  0.109821485807  0.0327623291308 #sub2
# 0.171068898419  0.0302236932618  0.096428283268  0.0131215505595  0.109139308988  0.0324764278088 #sub3
# 0.170988788114  0.0302762238162  0.096480449315  0.0131473827986  0.108874484126  0.0324891823683 #sub4
# 0.170047287624  0.0303037070276  0.096302944380  0.0130279706331  0.108270451596  0.0324051076683 #xgb(6-0.1-0.6-5-0.6).csv

# 0.132674747535  0.0283703857325  0.073061890770  0.0116233136495  0.085781432600  0.0298326837564

In [None]:
# Build the submission: ids from the sample submission file, followed by one
# predicted-probability column per target, then write it to disk.
subm = pd.read_csv('data/sample_submission.csv')
submid = subm["id"].to_frame("id")
pred_frame = pd.DataFrame(preds, columns=col)
submission = pd.concat([submid, pred_frame], axis=1)
submission.to_csv('xgb_stemmed_lemmatized(6-0.1-0.6-5-0.6).csv', index=False)

In [None]:
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    """Fit a multinomial Naive Bayes model and score two feature sets.

    Parameters
    ----------
    train_X, train_y
        Training features and binary labels.
    test_X
        Features of the validation fold.
    test_y
        Unused; kept so the signature parallels the other ``run*`` helpers.
    test_X2
        Features of the test set.

    Returns
    -------
    tuple
        ``(pred_test_y, pred_test_y2, model)`` where both predictions are the
        probability of the positive class.
    """
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    # Return probabilities rather than hard 0/1 labels: callers compute
    # log-loss on these values and average them across folds.
    # Column 1 is the positive class -- assumes binary 0/1 labels with both
    # classes present in train_y.
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    return pred_test_y, pred_test_y2, model


In [None]:
# Stratified K-fold evaluation of the Naive Bayes model, one target at a time.
avg_score = 0
n_splits = 5
for i, j in enumerate(col):
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train_df.shape[0])
    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for dev_index, val_index in kf.split(train_tfidf, train_df[j]):
        dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
        # kf.split yields positional indices, so select positionally.
        dev_y, val_y = train_df[j].iloc[dev_index], train_df[j].iloc[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
        print(cv_scores)
    print("Mean cv score : ", np.mean(cv_scores))
    # Average the test predictions over the folds actually run.
    pred_full_test = pred_full_test / float(n_splits)
    avg_score = avg_score + metrics.log_loss(train_df[j], pred_train)
# Mean out-of-fold log-loss over all target columns.
print(avg_score/len(col))