# Type-A: TF-IDF & LDA 
With this version, we are going to perform TF-IDF then LDA.  
We can then find similar documents from the corpus.  

## Libraries

In [22]:
# Imports 
import json 
import csv
import  itertools
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from smart_open import open
from bs4 import BeautifulSoup
from gensim.models import TfidfModel
from langdetect import detect

In [18]:
import ijson

## Pre processing

In [2]:
import pandas as pd

In [3]:
df = pd.read_json('./data/how-good-is-your-medium-article/test.json', lines=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34645 entries, 0 to 34644
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   _id         34645 non-null  object 
 1   _timestamp  34645 non-null  float64
 2   _spider     34645 non-null  object 
 3   url         34645 non-null  object 
 4   domain      34645 non-null  object 
 5   published   34645 non-null  object 
 6   title       34645 non-null  object 
 7   content     34645 non-null  object 
 8   author      34645 non-null  object 
 9   image_url   32711 non-null  object 
 10  tags        34645 non-null  object 
 11  link_tags   34645 non-null  object 
 12  meta_tags   34645 non-null  object 
dtypes: float64(1), object(12)
memory usage: 3.4+ MB


In [5]:
columns2keep = ['url', 'title', 'author', 'image_url']

In [6]:
df[columns2keep].head()

Unnamed: 0,url,title,author,image_url
0,https://medium.com/on-mornings/nocturnalmornin...,"For Night Owls, the Day Starts with a Nocturna...","{'name': None, 'url': 'https://medium.com/@HIT...",https://cdn-images-1.medium.com/focal/1200/632...
1,https://medium.com/wordsthatmatter/never-break...,Blockchain is Memory – Words That Matter – Medium,"{'name': None, 'url': 'https://medium.com/@mar...",https://cdn-images-1.medium.com/max/1200/1*taU...
2,https://medium.com/on-mornings/onmorningscredi...,ON MORNINGS Credits – On Mornings – Medium,"{'name': None, 'url': 'https://medium.com/@HIT...",https://cdn-images-1.medium.com/max/1200/1*ynE...
3,https://medium.com/@LanceUlanoff/apple-homepod...,Apple HomePod Review: Almost love – Lance Ulan...,"{'name': None, 'url': 'https://medium.com/@Lan...",https://cdn-images-1.medium.com/max/1200/1*b-Y...
4,https://blog.medium.com/tips-and-tricks-for-me...,Tips and tricks for Medium writers – 3 min read,"{'name': None, 'url': 'https://blog.medium.com...",


### Extract the content from the html tags

In [7]:
def extract_html_content(row, column_name='content'): 
    """
    Takes in a Row from a pandas DF. 
    Returns the text from the specified html in the given column-name
    
    Example: extract_html_content(df.iloc[0], 'content')
    """
    soup = BeautifulSoup(row[column_name], 'html.parser')
    return soup.getText()

In [9]:
df['html_text'] = df.apply(extract_html_content, axis='columns')

In [10]:
df.columns

Index(['_id', '_timestamp', '_spider', 'url', 'domain', 'published', 'title',
       'content', 'author', 'image_url', 'tags', 'link_tags', 'meta_tags',
       'html_text'],
      dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34645 entries, 0 to 34644
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   _id         34645 non-null  object 
 1   _timestamp  34645 non-null  float64
 2   _spider     34645 non-null  object 
 3   url         34645 non-null  object 
 4   domain      34645 non-null  object 
 5   published   34645 non-null  object 
 6   title       34645 non-null  object 
 7   content     34645 non-null  object 
 8   author      34645 non-null  object 
 9   image_url   32711 non-null  object 
 10  tags        34645 non-null  object 
 11  link_tags   34645 non-null  object 
 12  meta_tags   34645 non-null  object 
 13  html_text   34645 non-null  object 
dtypes: float64(1), object(13)
memory usage: 3.7+ MB


### Save samples with the needed columns as CSVs 
Samples to save:  
* 500 samples  
* 1000 samples   
* 10000 samples  
* 30000 samples  
* Whole document  

In [12]:
columns2keep.append('html_text')
columns2keep

['url', 'title', 'author', 'image_url', 'html_text']

In [15]:
df[columns2keep].sample(2).to_json('sample.json', orient='records')

In [48]:
counts = [20]#, 1000, 5000, 10000, 25000]
for count in counts: 
    print(f'Starting {count} count saving...')
    df[columns2keep].sample(count).to_json(f'./data/samples/sample{count}.json', orient='records')
#     df[columns2keep].sample(count).to_csv(f'./data/samples/sample{count}.csv')

Starting 20 count saving...


In [17]:
# Save the whole document.
df[columns2keep].sample(frac=1).to_json(f'./data/samples/sampleFull.json', orient='records')

## Training Models

### Prep the Data 

In [100]:
# Load in Dataset 
class MyCorpusJSON: 
    def __init__(self, json_link, column): 
        # idx is the index of the row where the text content is
        self.json_link = json_link 
        self.text_column = column
        self.count = 0
    
    def __len__(self): 
        return self.count
    
    def get_nth(self, n): 
        return next(itertools.islice(self.generator(), n, None))
    
    def generator(self): 
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                yield obj
                
    def __iter__(self):  
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                self.count += 1
                yield obj[self.text_column]
        
#     def __iter__(self):  
#         with open(self.csv_link) as csv_file: 
#             reader = csv.reader(csv_file)
#             next(reader, None)
#             for row in reader:
#                 self.count += 1
#                 print(self.count)
#                 if self.count == 398: 
#                     print(row)
#                 yield row[-1]

#                 yield row[self.content_idx] # To get the whole doc, use this instead

In [101]:
link = './data/samples/sample5000.json'
text_column = 'html_text'
mycorpus = MyCorpusJSON(link, text_column)

In [55]:
# # Sample code. prefix: item.author, item.html_text ...
# with open(link) as f: 
#     parser = ijson.parse(f)
#     for prefix, event, value in parser:
#         print('prefix={}, event={}, value={}'.format(prefix, event, value))

prefix=, event=start_array, value=None
prefix=item, event=start_map, value=None
prefix=item, event=map_key, value=url
prefix=item.url, event=string, value=https://medium.com/@datadig/how-to-innovate-in-life-insurance-using-analytics-and-machine-learning-57e4430bbcd2
prefix=item, event=map_key, value=title
prefix=item.title, event=string, value=How to innovate in life insurance using Analytics and Machine Learning
prefix=item, event=map_key, value=author
prefix=item.author, event=start_map, value=None
prefix=item.author, event=map_key, value=name
prefix=item.author.name, event=null, value=None
prefix=item.author, event=map_key, value=url
prefix=item.author.url, event=string, value=https://medium.com/@datadig
prefix=item.author, event=map_key, value=twitter
prefix=item.author.twitter, event=string, value=@datadig
prefix=item.author, event=end_map, value=None
prefix=item, event=map_key, value=image_url
prefix=item.image_url, event=null, value=None
prefix=item, event=map_key, value=html_te

In [92]:
# Create a dictionary of all words in corpus - remove the stopwords in the process
dictionary = corpora.Dictionary()
# Add document to dictionary 
for doc in mycorpus: 
    dictionary.add_documents([doc.lower().split()])

In [93]:
dictionary.num_docs

5000

In [94]:
print(dictionary)

Dictionary(665663 unique tokens: ['(segue', '--apiobserve', '.', '1', '2.3rails']...)


In [95]:
# Remove stopwords 
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in STOPWORDS
    if stopword in dictionary.token2id
]
# Remove id's that occur once 
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids + stop_ids)
dictionary.compactify()

In [146]:
print(dictionary)

Dictionary(147291 unique tokens: ['.', '1', '20,', '5,', '50']...)


## Testing Models

In [147]:
# Load in Dataset 
class BoWCorpusJSON: 
    def __init__(self, json_link, column, dictionary): 
        # idx is the index of the row where the text content is
        self.json_link = json_link 
        self.column_name = column
        self.count = 0
        self.dictionary = dictionary
    
    def __len__(self): 
        return self.count
        
    def __iter__(self):  
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                self.count += 1
#                 print(obj['url'])
                yield self.dictionary.doc2bow(obj[self.column_name].lower().split())
#         with open(self.csv_link) as csv_file: 
#             reader = csv.reader(csv_file)
#             next(reader, None)
#             for row in reader:
#                 self.count += 1
#                 yield dictionary.doc2bow(row[self.content_idx].lower().split())


## TF-IDF Models

In [148]:
bowcorpus = BoWCorpusJSON(link, text_column, dictionary)

In [165]:
# Save corpus
corpora.MmCorpus.serialize(f'./models/sample5000corpus.mm', bowcorpus)

In [166]:
# Load corpus
loaded_corpus = corpora.MmCorpus('./models/testing-corpus.mm')

In [152]:
tfidf_model = TfidfModel(bowcorpus, dictionary=dictionary)

In [150]:
next(bowcorpus.__iter__())

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 1),
 (5, 2),
 (6, 3),
 (7, 1),
 (8, 6),
 (9, 1),
 (10, 3),
 (11, 1),
 (12, 1),
 (13, 7),
 (14, 5),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 4),
 (25, 10),
 (26, 5),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 3),
 (31, 1),
 (32, 2),
 (33, 2),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 14),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 15),
 (44, 2),
 (45, 4),
 (46, 2),
 (47, 3),
 (48, 1),
 (49, 5),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 3),
 (55, 1),
 (56, 1),
 (57, 2),
 (58, 2),
 (59, 4),
 (60, 1),
 (61, 2),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 3),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 2),
 (71, 3),
 (72, 16),
 (73, 3),
 (74, 11),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 2),
 (82, 1),
 (83, 1),
 (84, 2),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 14),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 5),
 (93, 11),
 (94, 12),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (

In [153]:
tfidf_model[next(bowcorpus.__iter__())]

[(0, 0.008831299994767733),
 (1, 0.005144301458324677),
 (2, 0.009658736321516476),
 (3, 0.01802064487559183),
 (4, 0.007665093992361952),
 (5, 0.041020037740073616),
 (6, 0.043321757661164056),
 (7, 0.016708361061564984),
 (8, 0.07879599366879654),
 (9, 0.01986820939312529),
 (10, 0.05205051161542952),
 (11, 0.013188516257622655),
 (12, 0.013024116204510414),
 (13, 0.1101841551384785),
 (14, 0.05699680645572014),
 (15, 0.012236907357305213),
 (16, 0.017874567063970764),
 (17, 0.020510018870036808),
 (18, 0.01065641273973226),
 (19, 0.010143731985636869),
 (20, 0.017350170538476505),
 (21, 0.016708361061564984),
 (22, 0.009794112738828912),
 (23, 0.01732554082115547),
 (24, 0.03876795106345363),
 (25, 0.08044440393862709),
 (26, 0.06138852729198212),
 (27, 0.012818584653076785),
 (28, 0.022503661199191335),
 (29, 0.022503661199191335),
 (30, 0.04764277420444872),
 (31, 0.017350170538476505),
 (32, 0.03269672917836592),
 (33, 0.03520086843722998),
 (34, 0.005464210621143038),
 (35, 0.00

In [163]:
len(next(bowcorpus.__iter__()))

400

### Convert a sample text to TFIDF document

In [154]:
text = 'Using machine learning, one can find useful patterns from large data sets to make'\
'data more informative and qualitatively insightful. This is very important for'\
'decision making. Students will be exposed to supervised and unsupervised'\
'learning, respectively.'

In [155]:
text_words = simple_preprocess(text)
tfidf_vec = tfidf_model[dictionary.doc2bow(text_words)]
print(tfidf_vec)

[(96, 0.09473348576136774), (819, 0.11450882541465239), (855, 0.1678771767289562), (863, 0.07551095053400037), (1736, 0.11818518845932854), (6112, 0.14854943471979581), (6547, 0.23593639991792315), (6589, 0.07383963477463865), (6658, 0.21179327810422055), (7272, 0.19283204746820914), (10638, 0.20681875568365463), (12783, 0.3080069186654508), (24031, 0.3967431871677301), (38904, 0.33148525554037733), (43184, 0.3705862192511217), (93463, 0.4717743822329179)]


## LDA Model

In [167]:
lda_model = models.LdaModel(loaded_corpus, num_topics=200)

In [168]:
# See the topic distribution for sample text 
lda_vec = lda_model[dictionary.doc2bow(text_words)]

In [169]:
# Use LDA for similarity index 
lda_index = similarities.Similarity(None, lda_model[bowcorpus], len(dictionary))

## Get similar documents from the corpus

In [170]:
index = similarities.Similarity(None, tfidf_model[bowcorpus], len(dictionary))

In [171]:
res = index[tfidf_vec]

In [172]:
# Print top 10 documents 
print(list(sorted(enumerate(res), key=lambda x: x[1], reverse=True))[:10])

[(3693, 0.17650777), (2625, 0.16488062), (1011, 0.13188729), (3594, 0.121433794), (1743, 0.10338302), (3644, 0.09636955), (540, 0.09168525), (1152, 0.08723651), (3883, 0.08603429), (3622, 0.08497484)]


In [173]:
# Get the nth value from the corpus to confirm 
import itertools
index = 3693
mycorpus.get_nth(index)
# next(itertools.islice(mycorpus.get_nth(), index, None))

{'url': 'https://medium.com/bits-and-behavior/we-need-to-learn-how-to-teach-machine-learning-acc78bac3ff8',
 'title': 'We need to learn how to teach machine learning – Bits and Behavior – Medium',
 'author': {'name': None,
  'url': 'https://medium.com/@andyjko',
  'twitter': '@andyjko'},
 'image_url': 'https://cdn-images-1.medium.com/max/1200/1*SJvlRyU6oymSDjvxK7uiDQ.jpeg',
 'html_text': 'Andy J. KoAssociate Professor @UW_iSchool, Chief Scientist+Co-Founder @answerdash. Father, feminist, scientist, teacher, inventor, programmer, human.Aug 21, 2017We need to learn how to teach machine\xa0learningJackson Pollock’s ”Untitled.” I think it represents most students understanding of machine learning after taking a\xa0course.This is a revised version of a position paper I wrote for the ICER 2017 workshop “Learning about Machine Learning.”Knowledge of how to apply machine learning to products is on high demand but low supply. Journalists write endlessly about it, employers want engineers who ha

## Saving Models

In [175]:
# Save dictionary & corpus
dictionary.save('./models/dictionary/sample5000Dict')
corpora.MmCorpus.serialize(f'./models/sample5000corpus.mm', bowcorpus)

In [176]:
# Save the tfidf model 
tfidf_model.save('./models/tfidf-sample5000')

In [None]:
# Save the LDA model 

In [177]:
lda_model.save('./models/lda-sample5000')