In [30]:
import pandas as pd
import seaborn
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
from sklearn.pipeline import Pipeline
import spacy
import string
import joblib
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
spacy.load('en')
parser = English()

In [2]:
STOPWORDS = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLCHARS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

In [3]:
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

currentDF = politicalDF

In [4]:
def tokenizeText(textData):

    textData = textData.strip().replace("\n", " ").replace("\r", " ")
    textData = textData.lower()
    tokens = parser(textData)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    # Remove Stop Words
    tokens = [tok for tok in tokens if tok.lower() not in STOPWORDS]
    # Remove Symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLCHARS]
    # Remove words with less than 3 characters
    tokens = [tok for tok in tokens if len(tok) >= 3]
    # Remove Non-Alphabetic Characters
    tokens = [tok for tok in tokens if tok.isalpha()]
    
    # Stemming of Words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    tokens = list(set(tokens))
    textData = ' '.join(tokens[:])
    return textData

In [5]:
# hateSpeechDF['text'] = hateSpeechDF['text'].apply(lambda x:tokenizeText(x))
currentDF['text'] = currentDF['text'].apply(lambda x:tokenizeText(x))

In [6]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(currentDF['text'], currentDF['category'])
print (train_x)

4627    aiadmk nda parti ganga india join corrupt like...
3777    nitish son remarriag quickest rage bjp oath bi...
4924    upa india btw take aspir monument social prote...
3770    sushmaswaraj politician congratul pic visit in...
510                            sim unlock tmo instal card
                              ...                        
1539    time lol fhe jessica good decent emma gossip a...
3029    bjpempowersdalit peopl chat empow today india ...
2604        trade includ itali paologentiloni talk promot
426     sleep confus hour serious clock today turn loo...
3590    announc presid complex dharna tuesday sonia pa...
Name: text, Length: 4545, dtype: object


## Feature Extraction for Linear Classification

### Count Vector as Features

In [7]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(currentDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [8]:
xtrain_count.shape

(4545, 10204)

### Word Level TF-IDF

In [9]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(currentDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [10]:
xtrain_tfidf.shape

(4545, 5000)

### N-Gram TF-IDF

In [11]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(currentDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [12]:
xtrain_tfidf.shape

(4545, 5000)

### Character Level TF-IDF

In [13]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(currentDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [14]:
xtrain_tfidf.shape

(4545, 5000)

## Model

In [15]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.classification_report(predictions, valid_y)

## Training and Classification Reports

### Linear Classifier | Count

In [16]:
accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_count, train_y, xvalid_count)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.98      0.86      0.92       574
         POL       0.92      0.99      0.96       941

    accuracy                           0.94      1515
   macro avg       0.95      0.93      0.94      1515
weighted avg       0.95      0.94      0.94      1515



### Linear Classifier | Word level TF-IDF

In [20]:
accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf, train_y, xvalid_tfidf)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.96      0.90      0.93       540
         POL       0.95      0.98      0.96       975

    accuracy                           0.95      1515
   macro avg       0.96      0.94      0.95      1515
weighted avg       0.95      0.95      0.95      1515



### Linear Classifier | N-Gram level TF-IDF

In [18]:
accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.08      0.93      0.15        45
         POL       1.00      0.69      0.81      1470

    accuracy                           0.69      1515
   macro avg       0.54      0.81      0.48      1515
weighted avg       0.97      0.69      0.79      1515



### Linear Classifier | Character level TF-IDF

In [19]:
accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print(accuracy)

              precision    recall  f1-score   support

      NOTPOL       0.92      0.88      0.90       526
         POL       0.94      0.96      0.95       989

    accuracy                           0.93      1515
   macro avg       0.93      0.92      0.92      1515
weighted avg       0.93      0.93      0.93      1515



In [31]:
def save_model(vectorizer,classifier,train_data,train_labels,test_data,test_label,pickle_name):
    # Initialize Pipeline
    vec_clf = Pipeline([('vectorizer', vectorizer), ('pac', classifier)])
    # fit the training dataset on the classifier
    vec_clf.fit(train_data,train_labels)
    # predict the labels on validation dataset
    predictions = vec_clf.predict(test_data)
    print(metrics.accuracy_score(predictions, test_label))
    # save Pipline
    joblib.dump(vec_clf, pickle_name, compress=True)

In [41]:
save_model(tfidf_vect,linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000),train_x,train_y,valid_x,valid_y,'../Saved Model/LR_Pol.pkl')

0.9498349834983498


In [48]:
classifier = joblib.load('../Saved Model/LR_Pol.pkl')
tweet = ["What is BJP doing for Farmers", "Hello how are you"]
predict = classifier.predict(tweet)
print(predict)

['POL' 'NOTPOL']
