# Imports and Downloads

In [None]:
import re
import nltk
import spacy
import string
import itertools
import numpy as np
import pandas as pd
from empath import Empath
import scipy.sparse as sp
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Fetch the NLTK corpora used by this notebook: the stop-word list and WordNet
# are needed by clean_text() (stopwords.words / WordNetLemmatizer).
# NOTE(review): 'words' is not referenced by any visible cell - confirm it is needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
# IPython shell escape, not plain Python.
# NOTE(review): `from empath import Empath` already ran in the import cell above,
# so on a fresh environment this install has to happen before that cell.
!pip install empath

# Import Dataset

In [None]:
# Read the tab-separated dataset and take a first look at its size and columns.
sample = pd.read_csv("sample.tsv", sep='\t', encoding='utf-8')

print(sample.shape, sample.columns, sep='\n')
sample.head()

In [None]:
# Upper-case the four known rating labels in place; any other value is left untouched.
for lowercase_label in ('true', 'false', 'partially false', 'other'):
    is_label = sample['our rating'] == lowercase_label
    sample.loc[is_label, 'our rating'] = lowercase_label.upper()

# The identifier column is not used as a feature.
sample.drop(columns=['public_id'], inplace=True)
sample.head()

# Combine Title and Text

In [None]:
# Merge headline and body into one document per article.
# Missing values are replaced with '' first: NaN + str yields NaN, which is not
# a string and would make the regex-based cleaning step further down raise.
sample['text'] = sample['title'].fillna('') + " " + sample['text'].fillna('')

# The title now lives inside 'text', so the separate column is redundant.
sample.drop(['title'], axis = 1, inplace = True)

print(sample['text'][0])

# Data Exploration

In [None]:
# Class balance of the target column, shown as a bar chart.
rating_counts = sample['our rating'].value_counts()
rating_counts.plot.bar()

# Data Preparation

In [None]:
# Deletes every ASCII punctuation character plus the typographic apostrophe.
# A translation table avoids the pitfalls of splicing string.punctuation into
# an unescaped regex character class (where the backslash acts as an escape
# and is therefore never matched itself).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + "’")

# NLTK resources shared by all clean_text() calls; filled on first use so the
# stop-word file is read and the lemmatizer is built once, not per document.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stops'] = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stops'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps, in order: strip punctuation, lower-case, drop English stop words,
    remove every remaining character that is not an ASCII letter or
    whitespace, then lemmatize each token with WordNet.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            a NaN coming from a missing cell) is treated as an empty document.

    Returns:
        The cleaned text as a single string; ``""`` when nothing survives.
    """
    if not isinstance(text, str):
        return ""

    stops, lemmatizer = _clean_text_resources()

    tokens = text.translate(_PUNCTUATION_TABLE).lower().split()
    kept = " ".join(token for token in tokens if token not in stops)

    # Digits and non-ASCII characters are removed only after stop-word
    # filtering, so a token such as "the1" is kept (as "the").
    letters_only = re.sub(r'[^a-zA-Z\s]', '', kept)

    return " ".join(lemmatizer.lemmatize(word) for word in letters_only.split())

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        cm: 2-D array; rows are plotted on the 'Actual label' axis and
            columns on the 'Predicted label' axis.
        classes: Tick labels, used for both axes.
        normalize: When True, every row is divided by its sum so cells show
            per-row fractions (printed with two decimals) instead of raw values.
        title: Figure title.
        cmap: Matplotlib colormap for the heat map.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of producing NaN.
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell = cm[i, j]
        plt.text(j, i, format(cell, '.2f') if normalize else cell,
                 horizontalalignment="center",
                 color="white" if cell > thresh else "black")

    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # Called once, after the axis labels exist, so they are included in the layout.
    plt.tight_layout()
    plt.show()

In [None]:
# Run the cleaning pipeline over every article and keep the result in its own column.
sample['clean_text'] = sample['text'].apply(clean_text)
sample.head()

# Cleaned Text: Splitting into training and testing

In [None]:
# Labels as strings; hold out 20% of the cleaned articles for evaluation.
y = sample['our rating'].astype('str')
X_train, X_test, y_train, y_test = train_test_split(
    sample['clean_text'],
    y,
    test_size=0.2,
    random_state=42,
)

print(X_train.head(), end='\n\n')
print(y_train.head())

# TF-IDF Vectorization

In [None]:
# Bigram TF-IDF features; the vocabulary is learned from the training split only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (2, 2))

tfidf_train = tfidf_vectorizer.fit_transform(X_train)

tfidf_test = tfidf_vectorizer.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations that do not have it yet.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print(list(feature_names[:10]))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        cm: 2-D array; rows are plotted on the 'True label' axis and columns
            on the 'Predicted label' axis.
        classes: Tick labels, used for both axes.
        normalize: When True, every row is divided by its sum so cells show
            per-row fractions (printed with two decimals) instead of raw values.
        title: Figure title.
        cmap: Matplotlib colormap for the heat map.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of producing NaN.
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell = cm[i, j]
        plt.text(j, i, format(cell, '.2f') if normalize else cell,
                 horizontalalignment="center",
                 color="white" if cell > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called once, after the axis labels exist, so they are included in the layout.
    plt.tight_layout()
    plt.show()

# Naive-Bayes

In [None]:
# Multinomial Naive Bayes on the text TF-IDF features.
nb_classifier = MultinomialNB(alpha=0.1).fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

print(classification_report(y_test, pred))

# Fix the label order so rows/columns line up with the tick labels below.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
print('Confusion Matrix: ', cm, sep='\n')
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Alpha tuning for Naive-Bayes

In [None]:
# Candidate smoothing strengths: 0.0 up to (but excluding) 1.0 in steps of 0.1.
alphas = np.arange(0, 1, 0.1)


def train_and_predict(alpha):
    """Fit MultinomialNB with the given alpha on the training TF-IDF matrix.

    Returns the accuracy of the fitted model on the test TF-IDF matrix.
    """
    model = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(tfidf_test))

In [None]:
# Report the test accuracy for every candidate alpha, one blank line between entries.
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha), end='\n\n')

# Random Forest

In [None]:
# Random forest on the text TF-IDF features.
rf_classifier = RandomForestClassifier(verbose=True).fit(tfidf_train, y_train)
pred = rf_classifier.predict(tfidf_test)

print(classification_report(y_test, pred))

# Fix the label order so rows/columns line up with the tick labels below.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
print('Confusion Matrix: ', cm, sep='\n')
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Gradient Boosting

In [None]:
# Gradient boosting on the text TF-IDF features.
gb_classifier = GradientBoostingClassifier(verbose=True).fit(tfidf_train, y_train)
pred = gb_classifier.predict(tfidf_test)

print(classification_report(y_test, pred))

# Fix the label order so rows/columns line up with the tick labels below.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
print('Confusion Matrix: ', cm, sep='\n')
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# POS Tagging

In [None]:
# The 'en' shortcut only exists on spaCy 2.x; spaCy 3 requires the full
# package name. Try the full name first and fall back to the legacy shortcut.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

# One string of space-separated coarse POS tags per article. nlp.pipe streams
# the documents through the pipeline in batches instead of one call per text.
pos_tags_column = [
    ' '.join(token.pos_ for token in doc)
    for doc in nlp.pipe(sample['text'])
]

sample['POS_text'] = pos_tags_column

sample.head()

In [None]:
# Same 80/20 split and seed as before, this time over the POS-tag strings.
y = sample['our rating'].astype('str')

X_train, X_test, y_train, y_test = train_test_split(
    sample['POS_text'],
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.head())
print(y_train.head())

# TFIDF for POS_text

In [None]:
# Bigram TF-IDF over the POS-tag strings.
# NOTE(review): stop_words='english' is applied to POS tags here; tags that
# collide with an English stop word after lower-casing would be dropped - confirm
# this is intended.
pos_tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (2,2))
pos_tfidf_train = pos_tfidf_vectorizer.fit_transform(X_train.astype('str'))
pos_tfidf_test= pos_tfidf_vectorizer.transform(X_test.astype('str'))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations that do not have it yet.
if hasattr(pos_tfidf_vectorizer, 'get_feature_names_out'):
    pos_feature_names = pos_tfidf_vectorizer.get_feature_names_out()
else:
    pos_feature_names = pos_tfidf_vectorizer.get_feature_names()
list(pos_feature_names[:10])

# Naive-Bayes

In [None]:
# Multinomial Naive Bayes on the POS TF-IDF features.
nb_classifier = MultinomialNB(alpha=0.1).fit(pos_tfidf_train, y_train)
pred = nb_classifier.predict(pos_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Random Forest

In [None]:
# Random forest on the POS TF-IDF features.
rf_classifier = RandomForestClassifier(verbose=True).fit(pos_tfidf_train, y_train)
pred = rf_classifier.predict(pos_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Gradient Boosting

In [None]:
# Gradient boosting on the POS TF-IDF features.
gb_classifier = GradientBoostingClassifier(verbose=True).fit(pos_tfidf_train, y_train)
pred = gb_classifier.predict(pos_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Semantic Analysis

In [None]:
lexicon = Empath()

# One vector of raw (un-normalised) Empath category counts per article, in the
# order in which analyze() returns its categories.
semantic = [
    np.asarray(list(lexicon.analyze(article, normalize=False).values()))
    for article in sample['text']
]
sample['semantic_text'] = semantic
print(sample['semantic_text'].head())

In [None]:
# Category names, taken from the keys Empath returns for an empty document.
categories = list(lexicon.analyze(""))
categories

In [None]:
# Turn each count vector into a pseudo-document: every category name is
# repeated as many times as it was counted, so TF-IDF can consume it as text.
sem = [
    " ".join(
        name
        for name, hits in zip(categories, counts)
        for _ in range(int(hits))
    )
    for counts in semantic
]
sample['semantics_text'] = sem

print(sample['semantics_text'].head())

In [None]:
# Same 80/20 split and seed as before, this time over the semantic pseudo-documents.
y = sample['our rating'].astype('str')
X_train, X_test, y_train, y_test = train_test_split(
    sample['semantics_text'],
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.head())
print(y_train.head())

In [None]:
# Unigram TF-IDF over the category-name pseudo-documents; fitted on the training split only.
sem_tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
sem_train_docs = X_train.astype('str')
sem_test_docs = X_test.astype('str')
sem_tfidf_train = sem_tfidf_vectorizer.fit_transform(sem_train_docs)
sem_tfidf_test = sem_tfidf_vectorizer.transform(sem_test_docs)

In [None]:
# Multinomial Naive Bayes on the semantic TF-IDF features.
nb_classifier = MultinomialNB(alpha=0.1).fit(sem_tfidf_train, y_train)
pred = nb_classifier.predict(sem_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

In [None]:
# Random forest on the semantic TF-IDF features.
rf_classifier = RandomForestClassifier(verbose=True).fit(sem_tfidf_train, y_train)
pred = rf_classifier.predict(sem_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

In [None]:
# Gradient boosting on the semantic TF-IDF features.
gb_classifier = GradientBoostingClassifier(verbose=True).fit(sem_tfidf_train, y_train)
pred = gb_classifier.predict(sem_tfidf_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Three-layer Classification

In [None]:
# Inspect the frame, then keep everything except the target as the feature table.
print(sample.columns, sample.shape, sep='\n')

X = sample.drop(columns=['our rating'])

print(X.columns)

In [None]:
# Cast the labels to str exactly as the three single-feature splits do, so a
# missing rating becomes the string 'nan' instead of a float mixed in among
# string labels.
y = sample['our rating'].astype('str')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(y)

In [None]:
# Pull the three text representations out of the shared train/test frames.
X_train_text, X_test_text = X_train['clean_text'], X_test['clean_text']
X_train_POS, X_test_POS = X_train['POS_text'], X_test['POS_text']
X_train_sem, X_test_sem = X_train['semantics_text'], X_test['semantics_text']

In [None]:
# Uni- to tri-gram TF-IDF over the cleaned text, capped at 20000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=20000,
)
text_train_docs = X_train_text.astype('str')
text_test_docs = X_test_text.astype('str')
tfidf_train = tfidf_vectorizer.fit_transform(text_train_docs)
tfidf_test = tfidf_vectorizer.transform(text_test_docs)

In [None]:
# Uni- to tri-gram TF-IDF over the POS-tag strings, capped at 20000 features.
pos_tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=20000,
)
pos_train_docs = X_train_POS.astype('str')
pos_test_docs = X_test_POS.astype('str')
pos_tfidf_train = pos_tfidf_vectorizer.fit_transform(pos_train_docs)
pos_tfidf_test = pos_tfidf_vectorizer.transform(pos_test_docs)

In [None]:
# Unigram TF-IDF over the semantic pseudo-documents, capped at 20000 features.
sem_tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=20000,
)
sem_train_docs = X_train_sem.astype('str')
sem_test_docs = X_test_sem.astype('str')
sem_tfidf_train = sem_tfidf_vectorizer.fit_transform(sem_train_docs)
sem_tfidf_test = sem_tfidf_vectorizer.transform(sem_test_docs)

In [None]:
# Relative importance of each feature block (the fractions sum to 1, scaled by 3).
text_w = 0.5 * 3
pos_w = 0.15 * 3
sem_w = 0.35 * 3

# Train and test matrices of a block must receive the same weight; otherwise
# the classifier is evaluated on features scaled differently from those it was
# fitted on.
tfidf_train *= text_w
tfidf_test *= text_w
pos_tfidf_train *= pos_w
pos_tfidf_test *= pos_w
sem_tfidf_train *= sem_w
sem_tfidf_test *= sem_w

In [None]:
# Concatenate the three weighted feature blocks column-wise, in the order
# POS | text | semantic. All three come from the same train/test split, so
# their row counts already agree. If they ever did not, hstack raises a
# ValueError, which is preferable to padding with zero rows that would no
# longer line up with y_train / y_test.
X_train = sp.hstack((pos_tfidf_train, tfidf_train, sem_tfidf_train), format='csr')
X_test = sp.hstack((pos_tfidf_test, tfidf_test, sem_tfidf_test), format='csr')

# Naive-Bayes

In [None]:
# Multinomial Naive Bayes on the stacked feature matrix.
nb_classifier = MultinomialNB(alpha=0.1).fit(X_train, y_train)
pred = nb_classifier.predict(X_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Random Forest

In [None]:
# Random forest on the stacked feature matrix.
rf_classifier = RandomForestClassifier().fit(X_train, y_train)
pred = rf_classifier.predict(X_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])

# Gradient Boosting

In [None]:
# Gradient boosting on the stacked feature matrix.
gb_classifier = GradientBoostingClassifier(verbose=True).fit(X_train, y_train)
pred = gb_classifier.predict(X_test)

print(classification_report(y_test, pred))

# Confusion matrix with a fixed label order matching the tick labels.
cm = confusion_matrix(
    y_test,
    pred,
    labels=['FALSE', 'TRUE', 'PARTIALLY FALSE', 'OTHER'],
)
plot_confusion_matrix(cm, classes=['FALSE', 'TRUE', 'PARTIALLY', 'OTHER'])
