## Raw text classification with BOW

In [1]:
# Load for Jupyter Notebook: make the project's `src` package importable.
import os
import sys

# Project root to put on the import path. Defaults to the original location;
# set the FNC_ROOT environment variable to run the notebook on another machine
# without editing this cell.
PROJECT_ROOT = os.environ.get('FNC_ROOT', '/home/elenaruiz/Documents/TFG/FNC')
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(PROJECT_ROOT)

In [17]:
import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora, models

from sklearn.model_selection import train_test_split

from src.utils import io
from src.fake_news_detector.core.nlp import clean_text as ct

### 1. Load dataset (tmp.json)

The file `tmp.json` contains all the raw data that has to be tokenized.

In [3]:
# Read the raw JSON file and wrap its 'articles' entry in a pandas DataFrame.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
articles = io.read_json_file(
    '/home/elenaruiz/Documents/TFG/FNC/src/data/tmp.json'
)
df = pd.DataFrame(data=articles['articles'])

In [4]:
# Inspect which fields each article record provides.
df.columns

Index(['fake', 'subtitle', 'text', 'title', 'url'], dtype='object')

### 2. Join and clean raw text

Tokenize the text into words. For each article, store a single list of tokens.

In [5]:
# Build one flat token list per article: title tokens first, then subtitle
# tokens, then the tokens of every entry in the article's 'text' field.
# NOTE(review): the meaning of the second argument (True) of
# clean_text_by_word is not visible here — confirm against its definition.
corpus = []
for _, row in df.iterrows():
    title_tokens = ct.clean_text_by_word(row['title'], True)
    subtitle_tokens = ct.clean_text_by_word(row['subtitle'], True)
    body_tokens = []
    for sent in row['text']:
        body_tokens += ct.clean_text_by_word(sent, True)
    corpus.append(title_tokens + subtitle_tokens + body_tokens)
# Number of tokenized articles (one corpus entry per DataFrame row).
len(corpus)

101

In [6]:
# Store the token lists next to their labels in a single DataFrame.
# The 'fake' column is multiplied by 1 to obtain the label
# (presumably turning a boolean flag into 0/1 — verify against the data).
documents = pd.DataFrame(
    {
        'corpus': corpus,
        'label': df['fake'] * 1,
    }
)

In [7]:
# Preview the first rows of the tokenized corpus and their labels.
documents.head()

Unnamed: 0,corpus,label
0,"[they, find, corpse, vegetarian, restaurant, B...",1
1,"[switzerland, warn, authorize, extradition, po...",1
2,"[navarre, censor, Songs, Amaral, Shakira, song...",1
3,"[a, woman, pretend, blind, years, greet, peopl...",1
4,"[arrested, ejaculate, boss, coffee, last, four...",1


### 3. Bag of Words
Create dictionary

In [9]:
# Map every distinct token of the corpus to an integer id.
# `corpora` is the same module as `gensim.corpora` (imported at the top).
dictionary = corpora.Dictionary(documents['corpus'])

In [10]:
# Preview the first 11 (id, token) pairs stored in the dictionary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 A
1 Asians
2 Bangkok
3 Daily
4 Errors
5 Inpathom
6 Khaosod
7 Mail
8 October
9 Prasit
10 Restaurant


In [11]:
# Remove the least used tokens (the dictionary is pruned in place).
dictionary.filter_extremes(
    no_below=15,    # keep tokens that appear in at least 15 documents
    no_above=0.5,   # ...and in no more than 50% of the documents
    keep_n=100000,  # then keep at most the 100000 most frequent tokens
)

### 4. Vectorize the content
Convert each document to its bag-of-words representation with `doc2bow`:

In [14]:
# Turn every token list into its sparse bag-of-words vector:
# a list of (token_id, count) pairs over the filtered dictionary.
bow_corpus = list(map(dictionary.doc2bow, documents['corpus']))


[(3, 1),
 (4, 2),
 (11, 7),
 (15, 1),
 (17, 2),
 (22, 1),
 (25, 1),
 (30, 2),
 (35, 3),
 (40, 1),
 (41, 1),
 (52, 1),
 (64, 2),
 (67, 1),
 (70, 1),
 (77, 1),
 (81, 1),
 (83, 1),
 (89, 1),
 (99, 1),
 (101, 1)]

In [16]:
# EXAMPLE: for document 30, list each dictionary token it contains together
# with the number of times that token occurs.
bow_doc_30 = bow_corpus[30]

# Each bag-of-words entry is a (token_id, count) pair, so unpack it directly
# instead of indexing by position.
for token_id, token_count in bow_doc_30:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 2 ("find") appears 1 time.
Word 5 ("he") appears 1 time.
Word 7 ("like") appears 1 time.
Word 8 ("make") appears 2 time.
Word 13 ("say") appears 4 time.
Word 16 ("time") appears 2 time.
Word 19 ("accord") appears 2 time.
Word 23 ("case") appears 3 time.
Word 32 ("seem") appears 1 time.
Word 46 ("it") appears 4 time.
Word 50 ("explain") appears 1 time.
Word 51 ("many") appears 1 time.
Word 52 ("people") appears 1 time.
Word 54 ("this") appears 1 time.
Word 55 ("years") appears 2 time.
Word 60 ("could") appears 1 time.
Word 62 ("point") appears 1 time.
Word 63 ("that") appears 1 time.
Word 64 ("another") appears 2 time.
Word 68 ("even") appears 1 time.
Word 69 ("take") appears 3 time.
Word 75 ("European") appears 2 time.
Word 81 ("we") appears 2 time.
Word 91 ("go") appears 1 time.
Word 95 ("still") appears 1 time.
Word 98 ("become") appears 1 time.
Word 100 ("next") appears 1 time.


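### TF-IDF
Re-weight the bag-of-words counts by TF-IDF, so that terms are ranked by how characteristic they are of each document rather than by raw frequency.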

In [18]:
# Fit a TF-IDF model on the bag-of-words corpus.
# `models` is the same module as `gensim.models` (imported at the top).
tfidf = gensim.models.TfidfModel(bow_corpus)

In [19]:
from pprint import pprint

# Apply the fitted TF-IDF model to the whole bag-of-words corpus.
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print the weights of the first document only, then stop.
for weighted_doc in corpus_tfidf:
    pprint(weighted_doc)
    break

[(0, 0.1433809580681048),
 (1, 0.1263779958704988),
 (2, 0.4246240358860827),
 (3, 0.11563340754452661),
 (4, 0.11905796208001264),
 (5, 0.13030389088330102),
 (6, 0.08080304837609287),
 (7, 0.10615600897152068),
 (8, 0.07060398679259744),
 (9, 0.14825911136998027),
 (10, 0.13030389088330102),
 (11, 0.7412955568499014),
 (12, 0.29651822273996054),
 (13, 0.059859398466625226),
 (14, 0.1263779958704988),
 (15, 0.11563340754452661),
 (16, 0.08080304837609287)]


In [20]:
### 5. LDA with BOW
# Fit a 10-topic LDA model on the raw bag-of-words counts (2 passes over the
# corpus, 2 worker processes). LDA training is stochastic, so random_state is
# fixed to seed the model's random initialisation; without it every
# Restart & Run All produces different topics.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [21]:
# Print every topic of the model (-1 requests all of them) together with the
# weighted words that describe it.
for topic_id, topic_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

Topic: 0 
Words: 0.053*"say" + 0.040*"I" + 0.037*"police" + 0.035*"right" + 0.028*"people" + 0.024*"know" + 0.024*"another" + 0.022*"country" + 0.021*"call" + 0.021*"case"
Topic: 1 
Words: 0.044*"use" + 0.037*"already" + 0.036*"point" + 0.033*"public" + 0.033*"they" + 0.029*"come" + 0.027*"consider" + 0.025*"way" + 0.025*"many" + 0.024*"in"
Topic: 2 
Words: 0.045*"say" + 0.041*"police" + 0.036*"euros" + 0.033*"would" + 0.028*"first" + 0.027*"in" + 0.024*"use" + 0.022*"new" + 0.021*"find" + 0.021*"two"
Topic: 3 
Words: 0.049*"in" + 0.045*"use" + 0.041*"make" + 0.039*"first" + 0.034*"open" + 0.033*"euros" + 0.031*"go" + 0.029*"since" + 0.028*"new" + 0.026*"it"
Topic: 4 
Words: 0.048*"would" + 0.045*"time" + 0.040*"could" + 0.035*"end" + 0.033*"say" + 0.029*"years" + 0.026*"company" + 0.025*"case" + 0.023*"European" + 0.023*"want"
Topic: 5 
Words: 0.053*"it" + 0.030*"time" + 0.028*"take" + 0.026*"they" + 0.025*"years" + 0.024*"a" + 0.024*"go" + 0.023*"people" + 0.022*"say" + 0.021*"he"
To

https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb