In [17]:
import pandas as pd
import seaborn
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
from sklearn import decomposition, ensemble
import spacy
import string
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
spacy.load('en')
parser = English()

In [18]:
STOPWORDS = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLCHARS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

In [19]:
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

currentDF = politicalDF

In [20]:
def tokenizeText(textData):

    textData = textData.strip().replace("\n", " ").replace("\r", " ")
    textData = textData.lower()
    tokens = parser(textData)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    # Remove Stop Words
    tokens = [tok for tok in tokens if tok.lower() not in STOPWORDS]
    # Remove Symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLCHARS]
    # Remove words with less than 3 characters
    tokens = [tok for tok in tokens if len(tok) >= 3]
    # Remove Non-Alphabetic Characters
    tokens = [tok for tok in tokens if tok.isalpha()]
    
    # Stemming of Words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    tokens = list(set(tokens))
    textData = ' '.join(tokens[:])
    return textData

In [21]:
# hateSpeechDF['text'] = hateSpeechDF['text'].apply(lambda x:tokenizeText(x))
currentDF['text'] = currentDF['text'].apply(lambda x:tokenizeText(x))

In [22]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(currentDF['text'], currentDF['category'])
print (train_x)

5849    tri follow rule mediev establish india mughal ...
1351    info katt mistakenli email hey dvd sent willia...
5963     capabl type npp embark lftrthat india netherland
2734              medal congratul athlet asian team india
4551    want cong bhai ruthless goon congress modi fin...
                              ...                        
5891    need drama act sasi aiadmk anymor sasikala aia...
4788    truth polit india sy waqar murder brought ideo...
3184    govt chang incindia knowingli uav isra million...
2178                            merg nepal indian bengali
5585              black crore lakh reveal demonetis money
Name: text, Length: 4545, dtype: object


## Feature Extraction

### Count Vector as Features

In [23]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(currentDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [24]:
xtrain_count.shape

(4545, 10204)

### Word Level TF-IDF

In [25]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(currentDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [26]:
xtrain_tfidf.shape

(4545, 5000)

### N-Gram TF-IDF

In [27]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(currentDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [28]:
xtrain_tfidf.shape

(4545, 5000)

### Character Level TF-IDF

In [29]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(currentDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [30]:
xtrain_tfidf.shape

(4545, 5000)

## Model

In [31]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.classification_report(predictions, valid_y)

## Training and Classification Reports

### Random Forest | Count

In [36]:
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10), xtrain_count, train_y, xvalid_count)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.93      0.89      0.91       539
         POL       0.94      0.96      0.95       976

    accuracy                           0.94      1515
   macro avg       0.94      0.93      0.93      1515
weighted avg       0.94      0.94      0.94      1515



### Random Forest | Word level TF-IDF

In [37]:
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10), xtrain_tfidf, train_y, xvalid_tfidf)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.99      0.89      0.93       575
         POL       0.93      0.99      0.96       940

    accuracy                           0.95      1515
   macro avg       0.96      0.94      0.95      1515
weighted avg       0.95      0.95      0.95      1515



### Random Forest | N-Gram level TF-IDF

In [38]:
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       1.00      0.62      0.76       837
         POL       0.68      1.00      0.81       678

    accuracy                           0.79      1515
   macro avg       0.84      0.81      0.79      1515
weighted avg       0.86      0.79      0.78      1515



### Random Forest | Character level TF-IDF

In [39]:
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.95      0.86      0.90       573
         POL       0.92      0.97      0.94       942

    accuracy                           0.93      1515
   macro avg       0.93      0.91      0.92      1515
weighted avg       0.93      0.93      0.93      1515

