In [106]:
import pandas as pd
import seaborn
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
import spacy
import string
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
spacy.load('en')
parser = English()

In [107]:
STOPWORDS = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLCHARS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

In [108]:
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

currentDF = politicalDF

In [109]:
def tokenizeText(textData):

    textData = textData.strip().replace("\n", " ").replace("\r", " ")
    textData = textData.lower()
    tokens = parser(textData)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    # Remove Stop Words
    tokens = [tok for tok in tokens if tok.lower() not in STOPWORDS]
    # Remove Symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLCHARS]
    # Remove words with less than 3 characters
    tokens = [tok for tok in tokens if len(tok) >= 3]
    # Remove Non-Alphabetic Characters
    tokens = [tok for tok in tokens if tok.isalpha()]
    
    # Stemming of Words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    tokens = list(set(tokens))
    textData = ' '.join(tokens[:])
    return textData

In [110]:
# hateSpeechDF['text'] = hateSpeechDF['text'].apply(lambda x:tokenizeText(x))
currentDF['text'] = currentDF['text'].apply(lambda x:tokenizeText(x))

In [111]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(currentDF['text'], currentDF['category'])
print (train_x)

1179                                                     
3857    token complet gift india nehru peac minist kab...
1241    littl googl mayb work understand voic english ...
4080    congress read thought age save india need narr...
2143    fan india guy hyderabad superb everyday trend ...
                              ...                        
561                 tell movi wanna think tag hello today
393                                                      
2192            landmark relat daniel visit import carmon
4536    safe akhilesh india samajwadi border yadav pol...
1190         insur trade organ free critic membership ill
Name: text, Length: 4545, dtype: object


## Feature Extraction

### Count Vector as Features

In [112]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(currentDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [113]:
xtrain_count.shape

(4545, 10204)

### Word Level TF-IDF

In [114]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(currentDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [115]:
xtrain_tfidf.shape

(4545, 5000)

### N-Gram TF-IDF

In [116]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(currentDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [117]:
xtrain_tfidf.shape

(4545, 5000)

### Character Level TF-IDF

In [118]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(currentDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [119]:
xtrain_tfidf.shape

(4545, 5000)

## Model

In [120]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.classification_report(predictions, valid_y)

## Training and Classification Reports

### Naive Bayes | Count

In [121]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.84      0.97      0.90       453
         POL       0.99      0.92      0.95      1062

    accuracy                           0.94      1515
   macro avg       0.92      0.95      0.93      1515
weighted avg       0.94      0.94      0.94      1515



### Naive Bayes | Word level TF-IDF

In [122]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.70      0.98      0.82       371
         POL       0.99      0.86      0.92      1144

    accuracy                           0.89      1515
   macro avg       0.85      0.92      0.87      1515
weighted avg       0.92      0.89      0.90      1515



### Naive Bayes | N-Gram level TF-IDF

In [123]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.17      0.64      0.26       137
         POL       0.95      0.69      0.80      1378

    accuracy                           0.68      1515
   macro avg       0.56      0.66      0.53      1515
weighted avg       0.88      0.68      0.75      1515



### Naive Bayes | Character level TF-IDF

In [124]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.69      0.98      0.81       364
         POL       0.99      0.86      0.92      1151

    accuracy                           0.89      1515
   macro avg       0.84      0.92      0.86      1515
weighted avg       0.92      0.89      0.89      1515

