In [1]:
import pandas as pd
import numpy as np
import pickle

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
# nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score

## Data ingestion and manipulation

In [2]:
twitter = pd.read_csv('Twitter_Data.csv')

In [3]:
twitter.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
twitter['category'].value_counts()

 1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64

In [5]:
reddit = pd.read_csv('Reddit_Data.csv')

In [6]:
reddit

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [7]:
reddit['category'].value_counts()

 1    15830
 0    13142
-1     8277
Name: category, dtype: int64

In [8]:
twitter = twitter.rename(columns = {'clean_text' : 'body'})
reddit = reddit.rename(columns = {'clean_comment' : 'body'})

data = pd.concat([twitter, reddit])
data

Unnamed: 0,body,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
37244,jesus,0.0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1.0
37246,downvote karna tha par upvote hogaya,0.0
37247,haha nice,1.0


In [9]:
data.isna().sum()

body        104
category      7
dtype: int64

In [10]:
data.dropna(inplace = True)

## Data Processing

In [11]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def process_text(sentence) :
        
    tokenized = word_tokenize(str(sentence))


    tokenized_lowcase = [word.lower() for word in tokenized]

    filtered_sentence = [word for word in tokenized_lowcase if not word in stop_words and word.isalpha()]

    stemmed = [stemmer.stem(word) for word in filtered_sentence]

    final = ' '.join(stemmed)

    return final

In [12]:
data['body'] = [process_text(sentence) for sentence in data['body']]

In [13]:
data.head()

Unnamed: 0,body,category
0,modi promis minimum govern maximum govern expe...,-1.0
1,talk nonsens continu drama vote modi,0.0
2,say vote modi welcom bjp told rahul main campa...,1.0
3,ask support prefix chowkidar name modi great s...,1.0
4,answer among power world leader today trump pu...,1.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    
    data.drop(columns = ['category']),
    data['category'],
    test_size = 0.30,
    random_state = 42
    
)

In [15]:
vectorizer= TfidfVectorizer()

tf_x_train = vectorizer.fit_transform(np.array(X_train).reshape(-1,))
tf_x_test = vectorizer.transform(np.array(X_test).reshape(-1,))

## Training and evaluation of ML models

In [16]:
DTC = DecisionTreeClassifier()
DTC.fit(tf_x_train, y_train)

In [17]:
DTC.score(tf_x_test, y_test)

0.7773669131854221

In [18]:
DTC_pred = DTC.predict(tf_x_test)

In [19]:
print(classification_report(y_test,DTC_pred))

              precision    recall  f1-score   support

        -1.0       0.67      0.63      0.65     13047
         0.0       0.81      0.84      0.82     20431
         1.0       0.80      0.80      0.80     26558

    accuracy                           0.78     60036
   macro avg       0.76      0.76      0.76     60036
weighted avg       0.78      0.78      0.78     60036



In [20]:
LR = LogisticRegression(multi_class='multinomial')

In [21]:
LR.fit(tf_x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
LR.score(tf_x_test, y_test)

0.8459091211939503

In [23]:
LR_pred = DTC.predict(tf_x_test)

In [24]:
print(classification_report(y_test,LR_pred))

              precision    recall  f1-score   support

        -1.0       0.67      0.63      0.65     13047
         0.0       0.81      0.84      0.82     20431
         1.0       0.80      0.80      0.80     26558

    accuracy                           0.78     60036
   macro avg       0.76      0.76      0.76     60036
weighted avg       0.78      0.78      0.78     60036



In [25]:
MNB = MultinomialNB()
MNB.fit(tf_x_train, y_train)

In [26]:
MNB.score(tf_x_test, y_test)

0.565827170364448

In [27]:
MNB_pred = DTC.predict(tf_x_test)

In [28]:
print(classification_report(y_test,MNB_pred))

              precision    recall  f1-score   support

        -1.0       0.67      0.63      0.65     13047
         0.0       0.81      0.84      0.82     20431
         1.0       0.80      0.80      0.80     26558

    accuracy                           0.78     60036
   macro avg       0.76      0.76      0.76     60036
weighted avg       0.78      0.78      0.78     60036



In [29]:
RFC = RandomForestClassifier(n_estimators = 10)
RFC.fit(tf_x_train, y_train)

In [30]:
RFC.score(tf_x_test, y_test)

0.7558464921047372

In [33]:
RFC_pred = DTC.predict(tf_x_test)

In [34]:
print(classification_report(y_test,RFC_pred))

              precision    recall  f1-score   support

        -1.0       0.67      0.63      0.65     13047
         0.0       0.81      0.84      0.82     20431
         1.0       0.80      0.80      0.80     26558

    accuracy                           0.78     60036
   macro avg       0.76      0.76      0.76     60036
weighted avg       0.78      0.78      0.78     60036



## Let's create a function to transform text to a compatible input format

In [35]:
def transform_input(sentence) :
    sentence = process_text(sentence)
    sentence = pd.DataFrame({'body' : [sentence]})
    sentence = vectorizer.transform(np.array(sentence).reshape(-1,))
    return sentence

In [36]:
LR.predict(transform_input('Today was a good day.'))

array([1.])

In [37]:
MNB.predict(transform_input('Today was a good day.'))

array([1.])

In [38]:
DTC.predict(transform_input('Today was a good day.'))

array([1.])

In [39]:
RFC.predict(transform_input('Today was a good day.'))

array([1.])

### Saving the models and the TF-IDF vectorizer for later use in the web app

In [42]:
pickle.dump(vectorizer, open("Models/TFIDF.pickle", "wb"))
pickle.dump(LR, open("Models/LR.pickle", "wb"))
pickle.dump(DTC, open("Models/DTC.pickle", "wb"))
pickle.dump(RFC, open("Models/RFC.pickle", "wb"))