In [89]:
import pandas as pd
import seaborn
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
from sklearn.svm import SVC
import spacy
import string
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
spacy.load('en')
parser = English()

In [90]:
STOPWORDS = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLCHARS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

In [91]:
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

currentDF = OffensiveLangDF

In [92]:
def tokenizeText(textData):

    textData = textData.strip().replace("\n", " ").replace("\r", " ")
    textData = textData.lower()
    tokens = parser(textData)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    # Remove Stop Words
    tokens = [tok for tok in tokens if tok.lower() not in STOPWORDS]
    # Remove Symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLCHARS]
    # Remove words with less than 3 characters
    tokens = [tok for tok in tokens if len(tok) >= 3]
    # Remove Non-Alphabetic Characters
    tokens = [tok for tok in tokens if tok.isalpha()]
    
    # Stemming of Words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    tokens = list(set(tokens))
    textData = ' '.join(tokens[:])
    return textData

In [93]:
# hateSpeechDF['text'] = hateSpeechDF['text'].apply(lambda x:tokenizeText(x))
currentDF['text'] = currentDF['text'].apply(lambda x:tokenizeText(x))

In [94]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(currentDF['text'], currentDF['category'])
print (train_x)

13318                             dyke date feel said hurt
20718    realli yellow fighter dalmatian black white go...
19976                     round team check mock simul pick
2491                                      smack bitch bout
18505                                           stupid hoe
                               ...                        
8938                                 rick bitch jame titti
4279                                                 pussi
20324                                     say goodby trash
4393                                        lol bitch meet
19141                   dri calori wast hell mad honey bun
Name: text, Length: 17514, dtype: object


## Feature Extraction

### Count Vector as Features

In [95]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(currentDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [96]:
xtrain_count.shape

(17514, 14158)

### Word Level TF-IDF

In [97]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(currentDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [98]:
xtrain_tfidf.shape

(17514, 5000)

### N-Gram TF-IDF

In [99]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(currentDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [100]:
xtrain_tfidf.shape

(17514, 5000)

### Character Level TF-IDF

In [101]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(currentDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [102]:
xtrain_tfidf.shape

(17514, 5000)

## Model

In [103]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.classification_report(predictions, valid_y)

## Training and Classification Reports

### SVM | Count

In [104]:
accuracy = train_model(SVC(kernel='linear',decision_function_shape='ovo'), xtrain_count, train_y, xvalid_count)
print(accuracy)

               precision    recall  f1-score   support

      Neither       0.85      0.86      0.86      1055
OffensiveLang       0.97      0.97      0.97      4784

     accuracy                           0.95      5839
    macro avg       0.91      0.92      0.91      5839
 weighted avg       0.95      0.95      0.95      5839



### SVM | Word level TF-IDF

In [105]:
accuracy = train_model(SVC(kernel='linear',decision_function_shape='ovo'), xtrain_tfidf, train_y, xvalid_tfidf)
print(accuracy)

               precision    recall  f1-score   support

      Neither       0.90      0.85      0.88      1140
OffensiveLang       0.96      0.98      0.97      4699

     accuracy                           0.95      5839
    macro avg       0.93      0.91      0.92      5839
 weighted avg       0.95      0.95      0.95      5839



### SVM | N-Gram level TF-IDF

In [106]:
accuracy = train_model(SVC(kernel='linear',decision_function_shape='ovo'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print(accuracy)

               precision    recall  f1-score   support

      Neither       0.18      0.83      0.29       226
OffensiveLang       0.99      0.84      0.91      5613

     accuracy                           0.84      5839
    macro avg       0.58      0.84      0.60      5839
 weighted avg       0.96      0.84      0.89      5839



### SVM | Character level TF-IDF

In [107]:
accuracy = train_model(SVC(kernel='linear',decision_function_shape='ovo'), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print(accuracy)

               precision    recall  f1-score   support

      Neither       0.87      0.83      0.85      1123
OffensiveLang       0.96      0.97      0.97      4716

     accuracy                           0.94      5839
    macro avg       0.92      0.90      0.91      5839
 weighted avg       0.94      0.94      0.94      5839

