In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [12]:
# Load the first 15,000 rows of the Reddit and Twitter datasets and give
# their text columns the shared name 'text'.
data1 = pd.read_csv('datasets/Reddit_Data.csv', nrows=15000).rename(
    columns={'clean_comment': 'text'}
)
print(data1.head())
print(len(data1))

data2 = pd.read_csv('datasets/Twitter_Data.csv', nrows=15000).rename(
    columns={'clean_text': 'text'}
)
print(data2.head())
print(len(data2))

# Stack both sources into one master dataframe with a fresh 0..n-1 index
data = pd.concat([data1, data2], ignore_index=True)
print(data.head())
print(len(data))

                                                text  category
0   family mormon have never tried explain them t...         1
1  buddhism has very much lot compatible with chr...         1
2  seriously don say thing first all they won get...        -1
3  what you have learned yours and only yours wha...         0
4  for your own benefit you may want read living ...         1
15000
                                                text  category
0  when modi promised “minimum government maximum...        -1
1  talk all the nonsense and continue all the dra...         0
2  what did just say vote for modi  welcome bjp t...         1
3  asking his supporters prefix chowkidar their n...         1
4  answer who among these the most powerful world...         1
15000
                                                text  category
0   family mormon have never tried explain them t...         1
1  buddhism has very much lot compatible with chr...         1
2  seriously don say thing first all they w

In [13]:
# Preprocessing

## Discard every row that contains at least one missing value
## (dropna defaults to axis=0, how='any')
data = data.dropna()
len(data)

29966

In [14]:
# Number of rows for each distinct value of 'category' (most frequent first).
data['category'].value_counts()

 1    12882
 0    10137
-1     6947
Name: category, dtype: int64

In [15]:
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
import string

# Lemmatization
def lemmetization(doc):
    """Return one normalised string per token in ``doc``.

    Each token contributes its ``lemma_`` lower-cased and stripped of
    surrounding whitespace. Tokens whose lemma is the "-PRON-" placeholder
    contribute their ``lower_`` form instead.
    """
    return [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]

# Remove punctuation and stopwords
def punc_removal(doc, stop_words=None):
    """Filter stop words and punctuation tokens out of a sequence of strings.

    Args:
        doc: Iterable of token strings (e.g. the output of ``lemmetization``).
        stop_words: Optional container of words to drop. When omitted, the
            ``STOP_WORDS`` collection imported from spaCy is used.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    # Membership is tested on the stop-word container itself; copying it into
    # a list for every token would turn each lookup into a full linear scan.
    # `in string.punctuation` is a substring test, so besides single
    # punctuation characters it also drops the empty string (e.g. whitespace
    # tokens that became '' after strip()).
    return [
        word
        for word in doc
        if word not in stop_words and word not in string.punctuation
    ]

# Module-level accumulator: every token kept by text_cleaning_english is
# appended here so that word frequencies can be computed afterwards.
save_words = []


def text_cleaning_english(text, mynlp):
    """Clean one piece of text and return it as a space-joined string.

    The text is processed by ``mynlp``, normalised with ``lemmetization`` and
    filtered with ``punc_removal``. As a side effect, the surviving tokens are
    appended to the module-level ``save_words`` list.
    """
    tokens = punc_removal(lemmetization(mynlp(text)))

    # Record the kept tokens for later frequency counting
    save_words.extend(tokens)

    return ' '.join(map(str, tokens))

def most_used_words(words=None, n=5):
    """Return the ``n`` most frequent words as a list of dicts.

    Args:
        words: Optional iterable of tokens to count. Defaults to the
            module-level ``save_words`` accumulator.
        n: How many of the most common words to report (default 5).

    Returns:
        A list of ``{'category': word, 'value': count}`` dicts ordered from
        the most to the least frequent word.
    """
    if words is None:
        words = save_words

    # A comprehension avoids a local variable called `list`, which would
    # shadow the builtin inside this function.
    return [
        {'category': word, 'value': count}
        for word, count in Counter(words).most_common(n)
    ]

In [16]:
# Load the spaCy pipeline that text_cleaning_english uses to process each text.
nlp = spacy.load("en_core_web_sm")


def text_cleaning(x):
    """Clean a single text with the notebook's shared spaCy pipeline."""
    return text_cleaning_english(x, mynlp=nlp)


# text_cleaning_english appends to the global save_words list. Empty it first
# so re-running this cell does not double-count words in most_used_words().
save_words.clear()

# Series.apply already returns a Series aligned to data's index, so it can be
# assigned as the new column directly.
data['cleaned_text'] = data['text'].apply(text_cleaning)

common_words = most_used_words()
print(common_words)

[{'category': 'modi', 'value': 16964}, {'category': 'india', 'value': 4835}, {'category': 'people', 'value': 4146}, {'category': 'bjp', 'value': 3627}, {'category': 'like', 'value': 3573}]


In [17]:
# Display the dataframe to inspect the new 'cleaned_text' column next to the original 'text'.
data

Unnamed: 0,text,category,cleaned_text
0,family mormon have never tried explain them t...,1,family mormon try explain stare puzzle time ti...
1,buddhism has very much lot compatible with chr...,1,buddhism lot compatible christianity especiall...
2,seriously don say thing first all they won get...,-1,seriously don thing win complex explain normal...
3,what you have learned yours and only yours wha...,0,learn want teach different focus goal wrap pap...
4,for your own benefit you may want read living ...,1,benefit want read live buddha living christ th...
...,...,...,...
29995,well supporter modi you people hate modi just ...,-1,supporter modi people hate modi bcz 2002 riot ...
29996,pankaj tripathi mocking secularism the trailer...,-1,pankaj tripathi mock secularism trailer vivek ...
29997,nirav arrested\nchoksi running but will caught...,1,nirav arrest choksi run catch mallya bail good...
29998,pakistan has islamic country and built islam t...,0,pakistan islamic country build islam s whybeca...


In [18]:
# SPLIT TRAINING & TESTING DATA
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffled split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['category'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape, y_train.shape)

(23972,) (23972,)


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

def optimizer(data_train_input,data_train_target,model_type,data_test_input,data_test_target):
    """Print the test-set score of a TF-IDF + SVC pipeline for several kernels.

    Only acts when ``model_type`` is "linear"; for any other value nothing is
    trained or printed. Returns None in every case.
    """
    if model_type != "linear":
        return

    # Fit one fresh pipeline per kernel and report its score on the test data
    for kernel_name in ('linear', 'rbf', 'poly', 'sigmoid'):
        svc = SVC(kernel=kernel_name, C=1.0)
        vectorizer = TfidfVectorizer()
        pipeline = Pipeline([('tfidf', vectorizer), ('clf', svc)])
        fitted = pipeline.fit(data_train_input, data_train_target)
        print(fitted.score(data_test_input, data_test_target))

def sentiment_pipeline(data_train_input,data_train_target,model_type,data_test_input=None,data_test_target=None):
    """Build and fit a TF-IDF + classifier pipeline for the requested model.

    Args:
        data_train_input: Training texts.
        data_train_target: Training labels.
        model_type: One of "linear" (SVC), "logistic", "sgd", "naive_bayes"
            or "xgboost".
        data_test_input: Optional test texts. Only used for "linear", where
            they are forwarded to ``optimizer`` for its kernel comparison.
        data_test_target: Optional test labels, used together with
            ``data_test_input``.

    Returns:
        The fitted ``Pipeline`` (TF-IDF vectorizer followed by the classifier).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Classifier selection
    if model_type == "linear":
        classifier = SVC(random_state=42)
        # The kernel comparison scores on held-out data, so it only runs when
        # test data was supplied.
        if data_test_input is not None and data_test_target is not None:
            optimizer(data_train_input,data_train_target,model_type,data_test_input,data_test_target)
    elif model_type == "logistic":
        classifier = LogisticRegression(max_iter=200)
    elif model_type == "sgd":
        classifier = SGDClassifier()
    elif model_type == "naive_bayes":
        classifier = MultinomialNB()
    elif model_type == "xgboost":
        # NOTE(review): `eta` is xgboost's alias for `learning_rate`, yet both
        # are passed with different values (0.1 vs 0.5) -- confirm which one
        # is intended and drop the other.
        classifier = XGBClassifier(use_label_encoder=False,eta=0.1,gamma=0.3, n_estimators=100, learning_rate=0.5, min_child_weight=5, 
        max_depth=5, colsample_bytree=0.7,objective="multi:softmax", eval_metric="mlogloss",verbosity=0)
    else:
        # Fail with a clear message instead of reaching an unbound `classifier`.
        raise ValueError(
            "Unknown model_type {!r}; expected one of: "
            "linear, logistic, sgd, naive_bayes, xgboost".format(model_type)
        )

    tfidf = TfidfVectorizer()

    # Pipeline setup
    clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

    model = clf.fit(data_train_input,data_train_target)

    return model

def sentiment_model_predict(model,data_test_input,data_test_target):
    """Predict on the test inputs and print evaluation metrics.

    Prints accuracy plus macro-averaged precision, recall and F1 (each as a
    percentage rounded to two decimals), followed by the confusion matrix.
    Returns None.
    """
    predicted = model.predict(data_test_input)
    matrix = confusion_matrix(data_test_target, predicted)

    # Compute every metric before printing anything
    macro = {"average": "macro"}
    report = [
        ("Accuracy : ", accuracy_score(data_test_target, predicted)),
        ("Precision : ", precision_score(data_test_target, predicted, **macro)),
        ("Recall : ", recall_score(data_test_target, predicted, **macro)),
        ("F1-Score :", f1_score(data_test_target, predicted, **macro)),
    ]

    for label, score in report:
        print(label + str(round(score * 100, 2)))
    print(matrix)

In [26]:
# Support Vector Classification: fit on the training split, then report
# metrics on the test split
model = sentiment_pipeline(
    data_train_input=X_train,
    data_train_target=y_train,
    model_type='linear',
    data_test_input=X_test,
    data_test_target=y_test,
)
sentiment_model_predict(model=model, data_test_input=X_test, data_test_target=y_test)

Accuracy : 76.89
Precision : 76.39
Recall : 75.63
F1-Score :75.85
[[ 913  202  274]
 [ 100 1684  240]
 [ 215  354 2012]]


In [27]:
# # Logistic Regression
# model = sentiment_pipeline(X_train, y_train, 'logistic')
# sentiment_model_predict(model,X_test,y_test)

In [28]:
# # Stochastic Gradient Descent
# model = sentiment_pipeline(X_train, y_train, 'sgd')
# sentiment_model_predict(model,X_test,y_test)

In [29]:
# # Multinomial Naive Bayes
# model = sentiment_pipeline(X_train, y_train, 'naive_bayes')
# sentiment_model_predict(model,X_test,y_test)

In [30]:
# # Xgboost
# # change category value -1 to value 2
# y_train_new = y_train.copy()
# y_test_new = y_test.copy()
# y_train_new.loc[y_train_new == -1] = 2
# y_test_new.loc[y_test_new == -1] = 2

# model = sentiment_pipeline(X_train, y_train_new, 'xgboost')
# sentiment_model_predict(model,X_test,y_test_new)