In [1]:
import pandas as pd
from stop_words import get_stop_words
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec 


# Seaborn's current colour palette; later plotting cells index into it
# (color[0], color[1], ...) to pick one bar colour per chart.
color = sns.color_palette()
# Use seaborn's "dark" style for the plots that follow.
sns.set_style("dark")

#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# TweetTokenizer does not split at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer   


#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


# Shared NLP helpers read by clean():
#   lem           - WordNet lemmatizer
#   tokenizer     - tweet-aware tokenizer
#   eng_stopwords - NLTK English stop words, held in a set for fast membership tests
lem = WordNetLemmatizer()
tokenizer = TweetTokenizer()
eng_stopwords = set(stopwords.words("english"))

# Contraction -> expansion lookup used by clean() on lower-cased tokens.
# Keys are lower-case because clean() lower-cases the comment before the lookup.
# Each key appears exactly once: a dict literal silently keeps only the last
# value of a repeated key, so duplicates hide which expansion is really in effect.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "i'd" was listed twice ("I would" / "I had"); a single entry is kept and it
    # follows the "would" reading used by every other 'd contraction in this table.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    # Previously expanded to " will", which dropped the pronoun.
    "we'll": "we will",
    "tryin'": "trying",
}

def clean(comment):
    """
    Normalise one raw comment and return it as a single cleaned string.

    Steps, in order: lower-case, replace newlines with spaces, strip
    IPv4-looking number groups, strip ``[[...]`` spans, tokenize with the
    module-level ``tokenizer``, expand contractions through ``APPO``,
    lemmatize each token as a verb with ``lem``, and drop tokens found in
    ``eng_stopwords``.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (a string, not a list).
    """
    # Lower-case so that "Hi" and "hi" are the same token.
    comment = comment.lower()
    # Replace newlines with spaces.
    comment = re.sub(r"\n", " ", comment)
    # Remove leaky elements such as IP addresses.
    # Raw strings keep the patterns identical while avoiding the invalid
    # escape sequences (\d, \., \[) that the non-raw literals contained.
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # Remove usernames written as [[...]. The pattern is greedy, so everything
    # from the first "[[" to the last "]" in the comment is dropped.
    comment = re.sub(r"\[\[.*\]", "", comment)

    # Split the sentence into tokens.
    words = tokenizer.tokenize(comment)

    # Apostrophe replacement by dictionary lookup, e.g. you're --> you are.
    # An expansion is inserted as ONE multi-word item and is not re-tokenized,
    # so the lemmatizer and the stop-word filter below see it as a whole phrase.
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]

    clean_sent = " ".join(words)
    return clean_sent


In [None]:
# Stop words for the raw (un-cleaned) word counts below: the `stop_words`
# package list, a few hand-picked extras, and the upper-cased package list.
stop_words = get_stop_words('english')
stop_words_two = [word.upper() for word in stop_words]
stop_words += ["You", "like", "just", "will", "know", "u", "you."]
stop_words += stop_words_two
# Set copy of the list so the per-word membership test in the loop is O(1).
stop_word_set = set(stop_words)

df = pd.read_csv("train.csv")
total = len(df['comment_text'])

# NOTE(review): "identity_hate" is not counted here although the modelling
# cells use it as a sixth label - confirm whether that omission is intended.
names = ["toxic", "severe_toxic", "obscene", "threat", "insult"]
label_word_counts = {name: Counter() for name in names}

# One pass over the rows. A comment's words are filtered once and then added to
# the counter of every label that is set on that comment.
for row in df[['comment_text'] + names].itertuples(index=False, name=None):
    text, flags = row[0], row[1:]
    kept_words = [word for word in text.split(" ") if word and word not in stop_word_set]
    for name, flag in zip(names, flags):
        if flag:
            label_word_counts[name].update(kept_words)

# Per-label names kept for any later cell that reads them (Counter is a dict subclass).
toxic_dict = label_word_counts["toxic"]
severe_toxic_dict = label_word_counts["severe_toxic"]
obscene_dict = label_word_counts["obscene"]
threat_dict = label_word_counts["threat"]
insult_dict = label_word_counts["insult"]
dictionaries = [toxic_dict, severe_toxic_dict, obscene_dict, threat_dict, insult_dict]

# Ten most frequent non-stop-words per label.
for name, counts in zip(names, dictionaries):
    print(name)
    for k, v in counts.most_common(10):
        print("{0}: {1}".format(k, v))

# Distribution of comment lengths in characters; the closing tuple is the cell output.
lengths = df.comment_text.str.len()
lengths.hist()
lengths.mean(), lengths.std(), lengths.max(), lengths.min()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

df = pd.read_csv("train.csv")
comments = df["comment_text"].apply(clean)

# Text fed to the word cloud: the CLEANED comments, one per line.
# The previous version wrote the raw DataFrame (ids, raw text and label digits)
# to the file and never used `comments`.
text = "\n".join(comments)

# Keep comments.txt as an on-disk copy. Mode "w" overwrites it, so re-running the
# cell no longer appends another copy of the data; `with` closes the handle.
with open("comments.txt", "w", encoding="utf-8") as comments_file:
    comments_file.write(text)

# Word cloud with default settings.
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# Second cloud with a capped font size, drawn on its own figure.
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
df = pd.read_csv("train.csv")

# Number of tags set on each comment: columns from position 2 onward are summed.
rowsums = df.iloc[:, 2:].sum(axis=1)

# Comments per tag count, ordered by tag count. The same order is passed to
# barplot below, so the text labels zipped onto the bars line up with the bars.
# In value_counts() frequency order the two sequences could disagree.
x = rowsums.value_counts().sort_index()

# plot
plt.figure(figsize=(8, 4))
# x/y are passed by keyword; newer seaborn no longer accepts them positionally.
ax = sns.barplot(x=x.index, y=x.values, order=x.index, alpha=0.8, color=color[0])
plt.title("Tags per comment")
plt.ylabel('Number of Comments', fontsize=12)
plt.xlabel('Number of Tags ', fontsize=12)

# adding the text labels
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')


plt.show()


train = pd.read_csv("train.csv")

# Correlate the same column range that was summed as tags above.
# The earlier slice 2:-1 left the last column out of the heatmap.
temp_df = train.iloc[:, 2:]

corr = temp_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, annot=True)

In [None]:
df = pd.read_csv("train.csv")

# Everything in this cell is derived from the frame loaded just above, so the
# cell does not depend on a `train` variable left behind by another cell.

# Tag columns (position 2 onward); read as a global by top_feats_by_class.
train_tags = df.iloc[:, 2:]
# NOTE(review): train_feats is not read anywhere in the visible notebook and
# repeats the tag columns; kept only in case a later cell expects it.
train_feats = pd.concat([df, train_tags], axis=1)


clean_corpus = df["comment_text"].apply(clean)


# The idf/tf switches are passed as real booleans; recent scikit-learn releases
# validate parameter types and reject the integer 1.
tfv = TfidfVectorizer(min_df=200, max_features=10000,
            strip_accents='unicode', analyzer='word', ngram_range=(1, 1),
            use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')
tfv.fit(clean_corpus)
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn; use whichever the installed version provides.
if hasattr(tfv, "get_feature_names_out"):
    features = np.asarray(tfv.get_feature_names_out())
else:
    features = np.array(tfv.get_feature_names())

# TF-IDF matrix for the whole corpus (one row per comment).
train_unigrams = tfv.transform(clean_corpus)



#https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      : 1-D array of scores, one per feature.
    features : sequence of feature names, indexable by the same positions as row.
    top_n    : maximum number of rows to return.

    Returns a DataFrame with columns ['feature', 'tfidf'], highest score first.
    An empty row (or top_n=0) gives an empty frame with those two columns
    instead of raising a length-mismatch error when the columns are assigned.
    '''
    # Positions sorted by descending score, truncated to the requested count.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features for a single document, i.e. row `row_id` of the matrix Xtr. '''
    # Densify just the requested row, then flatten it to 1-D for ranking.
    dense_row = Xtr[row_id].toarray()
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    ''' Rank features by their mean tfidf over the documents (rows of Xtr) selected
        by grp_ids and return the top_n of them. Scores below min_tfidf are treated
        as zero before averaging. '''
    group_scores = Xtr[grp_ids].toarray()
    # Zero out, in place, every score under the threshold.
    np.putmask(group_scores, group_scores < min_tfidf, 0)
    mean_scores = group_scores.mean(axis=0)
    return top_tfidf_feats(mean_scores, features, top_n)

# modified for multilabel multiclass
def top_feats_by_class(Xtr, features, min_tfidf=0.1, top_n=20):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label.

        Reads the module-level `train_tags` frame: one result is produced per column,
        in column order, from the rows where that column equals 1. Each returned
        frame carries the column name in its `label` attribute. '''
    dfs = []
    for col in train_tags.columns:
        # Positional row numbers of the tagged documents. Xtr is indexed by
        # position, so this stays correct even if train_tags has a non-default index.
        ids = np.flatnonzero((train_tags[col] == 1).values)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Tag the frame with its own class name. The earlier code read an undefined
        # local `label`, i.e. whatever global of that name happened to exist.
        feats_df.label = col
        dfs.append(feats_df)
    return dfs


#get top n for unigrams
# Top TF-IDF unigrams per class, one DataFrame per tag column.
tfidf_top_n_per_lass = top_feats_by_class(train_unigrams, features)

# Panel titles, in the same order as the frames returned above.
panel_titles = [
    "Toxic Comments",
    "Severely toxic Comments",
    "Obscene Comments",
    "Threat Comments",
    "Insulting Comments",
    "Identity Hate Comments",
]

plt.figure(figsize=(16, 22))
plt.suptitle("TF_IDF Top words per class(unigrams)", fontsize=20)

# One bar chart per class on a 4x2 grid, filled row by row: panel k goes to
# (k // 2, k % 2). Each panel shows the first nine rows of that class's ranking.
for panel, (title, feats_df) in enumerate(zip(panel_titles, tfidf_top_n_per_lass)):
    plt.subplot2grid((4, 2), divmod(panel, 2))
    # x/y are passed by keyword; newer seaborn no longer accepts them positionally.
    sns.barplot(x=feats_df.feature.iloc[0:9], y=feats_df.tfidf.iloc[0:9], color=color[panel])
    plt.title(title, fontsize=15)
    plt.xlabel('Word', fontsize=12)
    plt.ylabel('TF-IDF score', fontsize=12)

plt.show()

In [None]:
#MultinomialNB base line
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef, confusion_matrix
from sklearn.naive_bayes import MultinomialNB



def train(col):
    """Fit and report a MultinomialNB baseline for one binary label column.

    Reads train.csv, holds out 33% of the rows (random_state=42), builds
    bag-of-words + TF-IDF features from the raw ``comment_text`` and prints the
    confusion-matrix counts, the Matthews correlation coefficient and the
    classification report for the held-out rows.

    Note: defining this function rebinds the module-level name ``train``, which
    an earlier cell uses for a DataFrame.

    Parameters
    ----------
    col : str
        Name of the label column in train.csv (values 0/1).

    Returns
    -------
    None
        Results are printed.
    """
    df = pd.read_csv("train.csv")
    # Split the raw text first, so the vocabulary and idf weights are learned
    # from the training rows only and the held-out rows do not leak into them.
    text_train, text_test, train_y, test_y = train_test_split(
        df["comment_text"], df[col], test_size=0.33, random_state=42)
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    train_x = tfidf_transformer.fit_transform(count_vect.fit_transform(text_train))
    test_x = tfidf_transformer.transform(count_vect.transform(text_test))
    clf = MultinomialNB().fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    not_col = "not " + col
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack cannot fail
    # when a class is missing from the held-out rows.
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred, labels=[0, 1]).ravel()
    # This heading is followed by the raw confusion-matrix counts tn, fp, fn, tp.
    print("-----Specificity---")
    print(tn, fp, fn, tp)
    print("----Matthew Correlation Coeffecient---")
    print(matthews_corrcoef(test_y, y_pred))
    print("---Precision and Recall---")
    # target_names follow label order: 0 is "not <col>", 1 is "<col>".
    print(classification_report(test_y, y_pred, labels=[0, 1], target_names=[not_col, col]))

# Run the baseline once for each of the six label columns.
total = 0
cols = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]

for col in cols:
    train(col)



In [None]:
#Multinomial Naive Bayes (Data Preparation)

import requests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from scipy.sparse import hstack
import scipy
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


def train_clean(col, train_features, y_train):
    """Fit and report a MultinomialNB model on pre-built features for one label.

    Holds out 33% of the rows (random_state=42), fits on the rest and prints the
    confusion-matrix counts, the Matthews correlation coefficient and the
    classification report for the held-out rows.

    Parameters
    ----------
    col : str
        Label name, used only for the report's class names.
    train_features : array-like or sparse matrix
        Feature matrix with one row per sample.
    y_train : array-like
        Binary labels (0/1) aligned with the rows of ``train_features``.

    Returns
    -------
    None
        Results are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(train_features, y_train, test_size=0.33, random_state=42)
    clf = MultinomialNB().fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    not_col = "not " + col
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack cannot fail
    # when a class is missing from the held-out rows.
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred, labels=[0, 1]).ravel()
    # This heading is followed by the raw confusion-matrix counts tn, fp, fn, tp.
    print("-----Specificity---")
    print(tn, fp, fn, tp)
    print("----Matthew Correlation Coeffecient---")
    print(matthews_corrcoef(test_y, y_pred))
    print("---Precision and Recall---")
    # target_names follow label order: 0 is "not <col>", 1 is "<col>".
    print(classification_report(test_y, y_pred, labels=[0, 1], target_names=[not_col, col]))
    
from scipy.sparse import csr_matrix, hstack

df = pd.read_csv("train.csv")
raw_text = df["comment_text"].copy()
X_train = raw_text.apply(clean)
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(X_train)
train_word_features = word_vectorizer.transform(X_train)

# Hand-made count features, one value per comment.
# Upper-case and stop-word counts are taken from the RAW text: clean() lower-cases
# the comment and filters stop words, so on cleaned text these two counts no longer
# describe the original comment.
train_upper = raw_text.apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train_unique = X_train.apply(lambda x: len(set(str(x).split())))
train_punc = X_train.apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
train_stop = raw_text.apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

# Bad-word list, one entry per line. It is parsed into a set so that `w in ...` is an
# exact-word test; against the raw response text it was a substring test that matched
# any fragment of the downloaded page.
bad_words_response = requests.get("https://gist.githubusercontent.com/ryanlewis/a37739d710ccdb4b406d/raw/0fbd315eb2900bb736609ea894b9bde8217b991a/google_twunter_lol", timeout=30)
bad_words_response.raise_for_status()  # fail loudly rather than counting against an error page
bad_words_list = {line.strip() for line in bad_words_response.text.splitlines() if line.strip()}
train_bad_words = X_train.apply(lambda x: len([w for w in str(x).split() if w in bad_words_list]))

# Append the count columns to the TF-IDF block while staying sparse. Densifying the
# TF-IDF matrix into a DataFrame needs rows x 10000 floats of memory and produces
# mixed and duplicate column names.
meta_features = np.column_stack([train_upper, train_unique, train_punc, train_stop, train_bad_words])
train_features = hstack([train_word_features, csr_matrix(meta_features)], format="csr")

cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for col in cols:
    print('fit ', col)
    y_train = df[col].copy()
    train_clean(col, train_features, y_train)

In [None]:
#Logistic Regression (Data Preparation)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


def train_clean(col, train_features, y_train):
    """Fit and report a LogisticRegression model on pre-built features for one label.

    Holds out 33% of the rows (random_state=42), fits
    ``LogisticRegression(C=0.1, solver='sag')`` on the rest and prints the
    confusion-matrix counts, the Matthews correlation coefficient and the
    classification report for the held-out rows.

    Parameters
    ----------
    col : str
        Label name, used only for the report's class names.
    train_features : array-like or sparse matrix
        Feature matrix with one row per sample.
    y_train : array-like
        Binary labels (0/1) aligned with the rows of ``train_features``.

    Returns
    -------
    None
        Results are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(train_features, y_train, test_size=0.33, random_state=42)
    clf = LogisticRegression(C=0.1, solver='sag').fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    not_col = "not " + col
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack cannot fail
    # when a class is missing from the held-out rows.
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred, labels=[0, 1]).ravel()
    # This heading is followed by the raw confusion-matrix counts tn, fp, fn, tp.
    print("-----Specificity---")
    print(tn, fp, fn, tp)
    print("----Matthew Correlation Coeffecient---")
    print(matthews_corrcoef(test_y, y_pred))
    print("---Precision and Recall---")
    # target_names follow label order: 0 is "not <col>", 1 is "<col>".
    print(classification_report(test_y, y_pred, labels=[0, 1], target_names=[not_col, col]))
    

import requests
from scipy.sparse import csr_matrix, hstack

df = pd.read_csv("train.csv")
raw_text = df["comment_text"].copy()
X_train = raw_text.apply(clean)
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(X_train)
train_word_features = word_vectorizer.transform(X_train)

# Hand-made count features, one value per comment.
# Upper-case and stop-word counts are taken from the RAW text: clean() lower-cases
# the comment and filters stop words, so on cleaned text these two counts no longer
# describe the original comment.
train_upper = raw_text.apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train_unique = X_train.apply(lambda x: len(set(str(x).split())))
train_punc = X_train.apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
train_stop = raw_text.apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

# Bad-word list, one entry per line. It is parsed into a set so that `w in ...` is an
# exact-word test; against the raw response text it was a substring test that matched
# any fragment of the downloaded page.
bad_words_response = requests.get("https://gist.githubusercontent.com/ryanlewis/a37739d710ccdb4b406d/raw/0fbd315eb2900bb736609ea894b9bde8217b991a/google_twunter_lol", timeout=30)
bad_words_response.raise_for_status()  # fail loudly rather than counting against an error page
bad_words_list = {line.strip() for line in bad_words_response.text.splitlines() if line.strip()}
train_bad_words = X_train.apply(lambda x: len([w for w in str(x).split() if w in bad_words_list]))

# Append the count columns to the TF-IDF block while staying sparse. Densifying the
# TF-IDF matrix into a DataFrame needs rows x 10000 floats of memory and produces
# mixed and duplicate column names.
meta_features = np.column_stack([train_upper, train_unique, train_punc, train_stop, train_bad_words])
train_features = hstack([train_word_features, csr_matrix(meta_features)], format="csr")


cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for col in cols:
    print('fit ', col)
    y_train = df[col].copy()
    train_clean(col, train_features, y_train)

In [None]:
#SGDClassifier (Data Preparation)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

def train_clean(col, train_features, y_train):
    """Fit and report a linear SVM trained by SGD on pre-built features for one label.

    Holds out 33% of the rows (random_state=42), fits an ``SGDClassifier`` with
    hinge loss and L2 penalty on the rest and prints the confusion-matrix counts,
    the Matthews correlation coefficient and the classification report for the
    held-out rows.

    Parameters
    ----------
    col : str
        Label name, used only for the report's class names.
    train_features : array-like or sparse matrix
        Feature matrix with one row per sample.
    y_train : array-like
        Binary labels (0/1) aligned with the rows of ``train_features``.

    Returns
    -------
    None
        Results are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(train_features, y_train, test_size=0.33, random_state=42)
    # `n_iter` is no longer accepted by SGDClassifier in later scikit-learn releases.
    # max_iter=5 with tol=None runs the same fixed five passes over the data.
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, tol=None, random_state=42).fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    not_col = "not " + col
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack cannot fail
    # when a class is missing from the held-out rows.
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred, labels=[0, 1]).ravel()
    # This heading is followed by the raw confusion-matrix counts tn, fp, fn, tp.
    print("-----Specificity---")
    print(tn, fp, fn, tp)
    print("----Matthew Correlation Coeffecient---")
    print(matthews_corrcoef(test_y, y_pred))
    print("---Precision and Recall---")
    # target_names follow label order: 0 is "not <col>", 1 is "<col>".
    print(classification_report(test_y, y_pred, labels=[0, 1], target_names=[not_col, col]))
    

import requests
from scipy.sparse import csr_matrix, hstack

df = pd.read_csv("train.csv")
raw_text = df["comment_text"].copy()
X_train = raw_text.apply(clean)
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(X_train)
train_word_features = word_vectorizer.transform(X_train)

# Hand-made count features, one value per comment.
# Upper-case and stop-word counts are taken from the RAW text: clean() lower-cases
# the comment and filters stop words, so on cleaned text these two counts no longer
# describe the original comment.
train_upper = raw_text.apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train_unique = X_train.apply(lambda x: len(set(str(x).split())))
train_punc = X_train.apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
train_stop = raw_text.apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

# Bad-word list, one entry per line. It is parsed into a set so that `w in ...` is an
# exact-word test; against the raw response text it was a substring test that matched
# any fragment of the downloaded page.
bad_words_response = requests.get("https://gist.githubusercontent.com/ryanlewis/a37739d710ccdb4b406d/raw/0fbd315eb2900bb736609ea894b9bde8217b991a/google_twunter_lol", timeout=30)
bad_words_response.raise_for_status()  # fail loudly rather than counting against an error page
bad_words_list = {line.strip() for line in bad_words_response.text.splitlines() if line.strip()}
train_bad_words = X_train.apply(lambda x: len([w for w in str(x).split() if w in bad_words_list]))

# Append the count columns to the TF-IDF block while staying sparse. Densifying the
# TF-IDF matrix into a DataFrame needs rows x 10000 floats of memory and produces
# mixed and duplicate column names.
meta_features = np.column_stack([train_upper, train_unique, train_punc, train_stop, train_bad_words])
train_features = hstack([train_word_features, csr_matrix(meta_features)], format="csr")

cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for col in cols:
    print('fit ', col)
    y_train = df[col].copy()
    train_clean(col, train_features, y_train)