
## Original GitHub Repo is [here](https://github.com/t-davidson/hate-speech-and-offensive-language).

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
%matplotlib inline

## load data

In [2]:
df = pd.read_csv("data/labeled_data-Davidson.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
df = df.drop(['Unnamed: 0', 'count','hate_speech','offensive_language','neither'], axis=1)

In [5]:
df.columns = ['label', 'comment_text']

In [6]:
df = df[['comment_text','label']]

In [7]:
def f2(row):
    if row['label'] == 1 or row['label'] == 0 :
        val = 1
    else:
        val = 0
    return val

In [8]:
df['label'] = df.apply(f2, axis=1)

In [9]:
df2 = pd.read_csv("data/jigsaw/train.csv")

In [10]:
def f(row):
    if row['toxic'] == 1 or row['severe_toxic'] == 1 or row['obscene'] == 1 or row['threat'] == 1 or row['insult'] == 1 or row['identity_hate'] == 1:
        val = 1
    else:
        val = 0
    return val

In [11]:
df2['label'] = df2.apply(f, axis=1)

In [12]:
df2 = df2.drop(['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'], axis=1)

In [13]:
df_combine = pd.concat([df,df2], axis=0)

In [14]:
df_combine.groupby('label').size()

label
0    147509
1     36845
dtype: int64

### Columns key:

    0 - normal
    1 - toxic

In [15]:
df3 = df_combine[df_combine['label'] == 0]
df4 = df_combine[df_combine['label'] == 1]

df3 =df3.sample(n=40000)

In [16]:
df_combine = pd.concat([df3,df4], axis=0)

In [17]:
df_combine.groupby('label').size()

label
0    40000
1    36845
dtype: int64

In [18]:
tweets=df_combine.comment_text

## Feature Generation

In [19]:
# import nltk
# nltk.download('stopwords')

In [20]:
stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
    )

In [21]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [22]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [23]:
# nltk.download('averaged_perceptron_tagger')

In [24]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [25]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )

In [26]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [27]:
#Now get other features
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [28]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", \
                        "vader compound", "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

In [29]:
feats = get_feature_array(tweets)

In [30]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [31]:
M.shape

(76845, 5863)

In [37]:
# save and load feature numpy array
# np.save('feature01.npy', M)
# M = np.load('feature01.npy')

In [38]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

# Running the model

The best model was selected using a GridSearch with 5-fold CV.

In [39]:
X = pd.DataFrame(M)
y = df_combine['label'].astype(int)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [42]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])

In [43]:
param_grid = [{}] # Optionally add parameters here

In [44]:
grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv=StratifiedKFold(n_splits=5, 
                                              random_state=42).split(X_train, y_train), 
                           verbose=2)

In [45]:
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=  44.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.2s remaining:    0.0s


[CV] ................................................. , total=  39.8s
[CV]  ................................................................
[CV] ................................................. , total=  36.2s
[CV]  ................................................................
[CV] ................................................. , total=  34.4s
[CV]  ................................................................
[CV] ................................................. , total=  36.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min finished


In [46]:
y_preds = model.predict(X_test)

## Evaluating the results

In [47]:
report = classification_report( y_test, y_preds )

In [48]:
print(report)

             precision    recall  f1-score   support

          0       0.87      0.88      0.88      3975
          1       0.87      0.86      0.87      3710

avg / total       0.87      0.87      0.87      7685



In [68]:
# save and load model
from sklearn.externals import joblib
joblib.dump(model.best_estimator_, 'feature01-model01.pkl')
# model = joblib.load(open('feature01-model01.pkl', 'rb'))

['feature01-model01.pkl']

## run another model

In [69]:
# define a function to shuffle split train and test dataset
def shuffle(matrix, target, test_proportion):
    ratio = int(matrix.shape[0]/test_proportion)
    X_train = matrix[ratio:,:]
    X_test =  matrix[:ratio,:]
    Y_train = target[ratio:,:]
    Y_test =  target[:ratio,:]
    return X_train, X_test, Y_train, Y_test

In [93]:
from scipy import sparse

In [94]:
tf=sparse.csr_matrix(M)
labels = df_combine['label'].astype(int)
labels = np.asarray(labels)
labels = np.reshape(labels, (76845,1))

In [95]:
X_train, X_test, Y_train, Y_test = shuffle(tf, labels,3)

In [96]:
print(X_test.shape)
print(X_train.shape)
print(X_train)

(25615, 5863)
(51230, 5863)
  (0, 0)	3.685087301158061
  (0, 9)	4.700989705871539
  (0, 37)	3.610900860909653
  (0, 39)	3.210349569269059
  (0, 42)	4.753448339865456
  (0, 92)	3.439682879662355
  (0, 94)	6.807127974519619
  (0, 142)	4.637858930234064
  (0, 143)	5.849288239732592
  (0, 151)	4.775908576545206
  (0, 317)	2.95716460272902
  (0, 320)	4.267812398349527
  (0, 334)	3.830175467977516
  (0, 340)	3.270381036076704
  (0, 350)	3.879461358663387
  (0, 353)	3.7176609448821853
  (0, 368)	8.378344674133523
  (0, 389)	10.50777512320044
  (0, 392)	11.13391364742
  (0, 395)	3.873685669741819
  (0, 399)	3.121717734639694
  (0, 440)	6.292559328513397
  (0, 443)	5.247315739231105
  (0, 449)	3.396737767418091
  (0, 459)	5.935715527064706
  :	:
  (51229, 5480)	1.0
  (51229, 5483)	1.0
  (51229, 5631)	1.0
  (51229, 5636)	1.0
  (51229, 5640)	1.0
  (51229, 5673)	2.0
  (51229, 5677)	2.0
  (51229, 5692)	4.0
  (51229, 5695)	1.0
  (51229, 5696)	3.0
  (51229, 5708)	1.0
  (51229, 5773)	1.0
  (51229, 584

In [85]:
# Let us define all the evaluation metrics
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

def evaluate_score(Y_test,predict): 
    loss = hamming_loss(Y_test,predict)
    print("Hamming_loss : {}".format(loss*100))
    accuracy = accuracy_score(Y_test,predict)
    print("Accuracy : {}".format(accuracy*100))

In [89]:
# evaluate results of first model
evaluate_score(y_test, y_preds)

Hamming_loss : 12.908262849707222
Accuracy : 87.09173715029279


### 1. Binary Relevance (BR) Method with MultinomialNB classifiers (from scratch)

In [86]:
from sklearn.naive_bayes import MultinomialNB

In [97]:
# clf will be the list of the classifiers for all the labels
# each classifier is fit with the training data and corresponding classifier
clf = []
for ix in range(1):
    clf.append(MultinomialNB())
    clf[ix].fit(X_train,Y_train[:,ix])

ValueError: Input X must be non-negative

## 2. BR Method with SVM classifier (from scikit-multilearn)

In [100]:
#create and fit classifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

In [None]:
classifier = BinaryRelevance(classifier = SVC(), require_dense = [False, True])
classifier.fit(X_train, Y_train)