## Experiment 2: BOW/TFIDF + LDA + LR

In [None]:
# Load for Jupyter Notebook
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

from src.fake_news_detector.core.encoders import bow as b
from src.fake_news_detector.core.encoders import tfidf as t
from src.fake_news_detector.core.classificators import latent_analysis as la

## 1. Import `dataset_raw.json`

In [None]:
# Read the raw JSON dump, then tabulate the entries stored under its 'articles' key.
articles = io.read_json_file(
    '/home/elenaruiz/Documents/FNC/src/data/dataset_raw.json'
)
df = pd.DataFrame(articles['articles'])

In [None]:
# Keep only the two fields this experiment uses: the 'all_word' text column and
# the 'fake' flag multiplied by 1 (presumably a boolean turned into 0/1 — confirm).
corpus = pd.DataFrame({
    'corpus': df['all_word'],
    'label': df['fake'] * 1,
})

# A bare `len(df)` in the middle of a cell is never displayed, so print it explicitly.
print("Number of articles:", len(df))
corpus.head()

## 2. Dictionary creation and word vectorization

In [None]:
# Switches intended for the encoding steps.
# NOTE(review): the BOW call in section 2.1 passes literal `True` values rather
# than these names — confirm whether they are still meant to be used.
output = True
filter_by_freq = True

### 2.1 BOW encoding

In [None]:
# `corpus` must remain a DataFrame: the encoding is stored on it as a column,
# so the documents are extracted into their own list instead of overwriting it.
documents = list(corpus['corpus'])

# NOTE(review): the positional arguments (True, True, 0.6, 2) are passed through
# unchanged; their meaning is defined by `b.bow_encoding` — confirm and name them.
corpus['bow_encoding'], dictionary = b.bow_encoding(documents, True, True, 0.6, 2)

### 2.2 TF-IDF


In [None]:
# The TF-IDF encoder is fed the BOW vectors stored in corpus['bow_encoding'].
# NOTE(review): the result is stored on `df`, whereas the BOW vectors and labels
# live on `corpus` — confirm which frame downstream cells are expected to read.
df['tfidf_encoding'] = t.tfidf_encoding(list(corpus['bow_encoding']))

## 3. Topic modeling with LDA

Separate train and test data

In [None]:
from sklearn.model_selection import train_test_split

# Assemble the single frame the later cells read from: text ('corpus'), 'label'
# and 'bow_encoding' come from `corpus`, 'tfidf_encoding' was stored on `df`.
df_content = corpus.assign(tfidf_encoding=df['tfidf_encoding'])

df_train, df_test = train_test_split(df_content, test_size=0.2, random_state=42)

# Later cells add columns to both halves; work on independent copies so those
# assignments never target a slice of `df_content`.
df_train = df_train.copy()
df_test = df_test.copy()

# Sequential id for the training rows. The list must be as long as the
# training split itself, not the full dataset.
df_train['id'] = list(range(len(df_train)))

Define function for showing top topics of fake and real news

In [None]:
import matplotlib.pyplot as plt

def print_top_topics_by_label(lda_model, corpus, labels, feature_col='lda_featrues'):
    """Plot the mean topic distribution per label and print its heaviest topics.

    For every value in ``labels`` the rows of ``corpus`` carrying that label
    are selected, their topic vectors are averaged, the average is drawn as a
    bar chart (one figure per label) and the top words of the five heaviest
    topics are printed.

    Args:
        lda_model: Topic model forwarded to ``la.get_top_words_by_id``.
        corpus: pandas DataFrame with a ``'label'`` column and a
            ``feature_col`` column holding one topic vector per row.
            Vectors are assumed to be equal-length numeric sequences
            (TODO confirm against the helper that produces them).
        labels: Iterable of label values to report on.
        feature_col: Name of the topic-vector column. The default is the
            column name this helper originally hard-coded.

    Returns:
        None. Output is the figures and the printed text.
    """
    for label in labels:
        sub_corpus = corpus.loc[corpus['label'] == label]
        if sub_corpus.empty:
            # Nothing to average for this label; say so instead of plotting NaNs.
            print("No documents with label", label)
            continue

        # Stack the per-document vectors into a 2-D array and average over
        # documents; this works whether the cells hold lists or arrays.
        topic_matrix = np.array(list(sub_corpus[feature_col]), dtype=float)
        topic_distribution = topic_matrix.mean(axis=0)

        # One figure per label so the bars of different labels do not overlap.
        plt.figure()
        plt.bar(range(len(topic_distribution)), topic_distribution, color="blue")
        plt.title("Mean topic distribution for label {}".format(label))
        plt.xlabel("topic id")
        plt.ylabel("mean weight")
        plt.show()

        print("Looking up top words from top topics from", label)
        # Five heaviest topics, reported in ascending topic-id order.
        for topic_id in sorted(np.argsort(topic_distribution)[-5:]):
            # `la` is the alias under which the notebook imports latent_analysis.
            top_words = la.get_top_words_by_id(lda_model, topic_id)
            print("For topic {}, the top words are: {}.".format(topic_id, ", ".join(top_words)))
    

### 3.1 BOW

Create LDA model with BOW encoding and show top words of each topic:

In [None]:
# Create the LDA model from the training BOW vectors, then print its topics' top words.
bow_lda_model = la.create_LDA(df_train['bow_encoding'], dictionary)
# `la` is the only alias under which latent_analysis is imported; `lda` is undefined.
la.print_top_words(bow_lda_model)

Store topic distribution of each document in dataset:

In [None]:
# Stored as 'bow_lda_features' — the spelling the LR section reads from df_train.
# The helper is reached through `la`, the alias latent_analysis is imported under.
df_train['bow_lda_features'] = la.get_topics_distribution_by_doc(bow_lda_model, dictionary, list(df_train['corpus']))

Show the top topics, together with their top words, for real and fake news (training set):

In [None]:
labels = [0, 1]

# The helper filters on a 'label' column and reads the topic vectors from a
# column named 'lda_featrues', so it needs the training DataFrame (not the raw
# list of documents) with the BOW topic column exposed under that name.
bow_train_topics = df_train.rename(columns={'bow_lda_features': 'lda_featrues'})
print_top_topics_by_label(bow_lda_model, bow_train_topics, labels)

Repeat the same analysis on the test data:

In [None]:
# Topic features for the held-out documents, from the BOW LDA model built above
# (the name `lda_model` is not defined anywhere in this notebook).
# NOTE(review): the training features come from `get_topics_distribution_by_doc`
# while these come from `get_all_topic_predictions` — confirm both helpers
# produce the same representation.
df_test['bow_lda_features'] = la.get_all_topic_predictions(bow_lda_model, df_test['corpus'])

# Report on the test split: the helper needs a DataFrame with 'label' and a
# topic column named 'lda_featrues'.
bow_test_topics = df_test.rename(columns={'bow_lda_features': 'lda_featrues'})
print_top_topics_by_label(bow_lda_model, bow_test_topics, labels)

In [None]:
# Model evaluation on the held-out documents.
# `la` is the alias latent_analysis is imported under; `ld` is undefined.
la.model_evaluation(bow_lda_model, list(df_test['corpus']))

#TODO: Check a random document

### 3.2 TF-IDF

In [None]:
# This experiment is LDA-based and the following cells all use `tfidf_lda_model`,
# so build an LDA model (not LSI) from the TF-IDF vectors under that name.
tfidf_lda_model = la.create_LDA(df_train['tfidf_encoding'], dictionary)
la.print_top_words(tfidf_lda_model)

In [None]:
# Stored as 'tfidf_lda_features' — the spelling the LR section reads from df_train.
# The helper is reached through `la`, the alias latent_analysis is imported under.
df_train['tfidf_lda_features'] = la.get_topics_distribution_by_doc(tfidf_lda_model, dictionary, list(df_train['corpus']))

In [None]:
labels = [0, 1]

# The helper filters on a 'label' column and reads the topic vectors from a
# column named 'lda_featrues', so it needs the training DataFrame (not the raw
# list of documents) with the TF-IDF topic column exposed under that name.
tfidf_train_topics = df_train.rename(columns={'tfidf_lda_features': 'lda_featrues'})
print_top_topics_by_label(tfidf_lda_model, tfidf_train_topics, labels)

In [None]:
# Topic features for the held-out documents, from the TF-IDF LDA model
# (`lda` and `lda_model` are not defined anywhere in this notebook).
# NOTE(review): the training features come from `get_topics_distribution_by_doc`
# while these come from `get_all_topic_predictions` — confirm both helpers
# produce the same representation.
df_test['tfidf_lda_features'] = la.get_all_topic_predictions(tfidf_lda_model, df_test['corpus'])

# Report on the test split: the helper needs a DataFrame with 'label' and a
# topic column named 'lda_featrues'.
tfidf_test_topics = df_test.rename(columns={'tfidf_lda_features': 'lda_featrues'})
print_top_topics_by_label(tfidf_lda_model, tfidf_test_topics, labels)

In [None]:
# Model evaluation on the held-out documents, using the model name the rest of
# this section works with (`tfidf_lda_model`) and the `la` module alias.
la.model_evaluation(tfidf_lda_model, list(df_test['corpus']))

## 4. Prediction with LR

In [None]:
# NOTE(review): this imports from `core.classificator` (singular) while the
# first cell imports `latent_analysis` from `core.classificators` (plural) —
# confirm the package name; one of the two paths is likely wrong.
from src.fake_news_detector.core.classificator import lr, helpers

# Label vectors of the train / test splits as NumPy arrays.
y_train = np.array(list(df_train.label))
y_test = np.array(list(df_test.label))

### 4.1 BOW

In [None]:
# Prepare train and test input: convert every stored BOW-LDA feature vector to
# an array and collect them into one array per split.
X_train = np.array([np.array(doc_features) for doc_features in df_train.bow_lda_features])
X_test = np.array([np.array(doc_features) for doc_features in df_test.bow_lda_features])


In [None]:
# Evaluate with LR: create a model through the project's `lr` helper, train it
# on the BOW-LDA training inputs and predict on the test inputs.
lr_model = lr.create_model()
lr.train_model(lr_model, X_train, y_train)
y_pred = lr.predict(lr_model, X_test)

# Reporting is delegated to the project helper, which receives the model, the
# training data and both the true and predicted test labels.
helpers.print_evaluation(lr_model, X_train, y_train, y_test, y_pred)

### 4.2 TF-IDF

In [None]:
# Prepare train and test input: convert every stored TF-IDF-LDA feature vector
# to an array and collect them into one array per split.
X_train = np.array([np.array(doc_features) for doc_features in df_train.tfidf_lda_features])
X_test = np.array([np.array(doc_features) for doc_features in df_test.tfidf_lda_features])

In [None]:
# Evaluate with LR: create a fresh model through the project's `lr` helper,
# train it on the TF-IDF-LDA training inputs and predict on the test inputs.
# Note that this rebinds `lr_model` and `y_pred` from the BOW run.
lr_model = lr.create_model()
lr.train_model(lr_model, X_train, y_train)
y_pred = lr.predict(lr_model, X_test)

# Reporting is delegated to the project helper, which receives the model, the
# training data and both the true and predicted test labels.
helpers.print_evaluation(lr_model, X_train, y_train, y_test, y_pred)