# Type-A: TF-IDF & LDA 
With this version, we are going to perform TF-IDF then LDA.  
We can then find similar documents from the corpus.  

## Libraries

In [2]:
# Imports 
import json 
import csv
import  itertools
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from smart_open import open
from bs4 import BeautifulSoup
from gensim.models import TfidfModel
from langdetect import detect



In [3]:
import ijson

## Pre processing

In [2]:
import pandas as pd

In [3]:
df = pd.read_json('./data/how-good-is-your-medium-article/test.json', lines=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34645 entries, 0 to 34644
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   _id         34645 non-null  object 
 1   _timestamp  34645 non-null  float64
 2   _spider     34645 non-null  object 
 3   url         34645 non-null  object 
 4   domain      34645 non-null  object 
 5   published   34645 non-null  object 
 6   title       34645 non-null  object 
 7   content     34645 non-null  object 
 8   author      34645 non-null  object 
 9   image_url   32711 non-null  object 
 10  tags        34645 non-null  object 
 11  link_tags   34645 non-null  object 
 12  meta_tags   34645 non-null  object 
dtypes: float64(1), object(12)
memory usage: 3.4+ MB


In [5]:
columns2keep = ['url', 'title', 'author', 'image_url']

In [6]:
df[columns2keep].head()

Unnamed: 0,url,title,author,image_url
0,https://medium.com/on-mornings/nocturnalmornin...,"For Night Owls, the Day Starts with a Nocturna...","{'name': None, 'url': 'https://medium.com/@HIT...",https://cdn-images-1.medium.com/focal/1200/632...
1,https://medium.com/wordsthatmatter/never-break...,Blockchain is Memory – Words That Matter – Medium,"{'name': None, 'url': 'https://medium.com/@mar...",https://cdn-images-1.medium.com/max/1200/1*taU...
2,https://medium.com/on-mornings/onmorningscredi...,ON MORNINGS Credits – On Mornings – Medium,"{'name': None, 'url': 'https://medium.com/@HIT...",https://cdn-images-1.medium.com/max/1200/1*ynE...
3,https://medium.com/@LanceUlanoff/apple-homepod...,Apple HomePod Review: Almost love – Lance Ulan...,"{'name': None, 'url': 'https://medium.com/@Lan...",https://cdn-images-1.medium.com/max/1200/1*b-Y...
4,https://blog.medium.com/tips-and-tricks-for-me...,Tips and tricks for Medium writers – 3 min read,"{'name': None, 'url': 'https://blog.medium.com...",


### Extract the content from the html tags

In [7]:
def extract_html_content(row, column_name='content'): 
    """
    Takes in a Row from a pandas DF. 
    Returns the text from the specified html in the given column-name
    
    Example: extract_html_content(df.iloc[0], 'content')
    """
    soup = BeautifulSoup(row[column_name], 'html.parser')
    return soup.getText()

In [9]:
df['html_text'] = df.apply(extract_html_content, axis='columns')

In [10]:
df.columns

Index(['_id', '_timestamp', '_spider', 'url', 'domain', 'published', 'title',
       'content', 'author', 'image_url', 'tags', 'link_tags', 'meta_tags',
       'html_text'],
      dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34645 entries, 0 to 34644
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   _id         34645 non-null  object 
 1   _timestamp  34645 non-null  float64
 2   _spider     34645 non-null  object 
 3   url         34645 non-null  object 
 4   domain      34645 non-null  object 
 5   published   34645 non-null  object 
 6   title       34645 non-null  object 
 7   content     34645 non-null  object 
 8   author      34645 non-null  object 
 9   image_url   32711 non-null  object 
 10  tags        34645 non-null  object 
 11  link_tags   34645 non-null  object 
 12  meta_tags   34645 non-null  object 
 13  html_text   34645 non-null  object 
dtypes: float64(1), object(13)
memory usage: 3.7+ MB


### Save samples with the needed columns as CSVs 
Samples to save:  
* 500 samples  
* 1000 samples   
* 10000 samples  
* 30000 samples  
* Whole document  

In [12]:
columns2keep.append('html_text')
columns2keep

['url', 'title', 'author', 'image_url', 'html_text']

In [15]:
df[columns2keep].sample(2).to_json('sample.json', orient='records')

In [48]:
counts = [20, 1000, 5000, 10000, 25000]
for count in counts: 
    print(f'Starting {count} count saving...')
    df[columns2keep].sample(count).to_json(f'./data/samples/sample{count}.json', orient='records')
#     df[columns2keep].sample(count).to_csv(f'./data/samples/sample{count}.csv')

Starting 20 count saving...


In [17]:
# Save the whole document.
df[columns2keep].sample(frac=1).to_json(f'./data/samples/sampleFull.json', orient='records')

## Training Models

### Prep the Data 

In [4]:
# Load in Dataset 
class MyCorpusJSON: 
    def __init__(self, json_link, column): 
        # idx is the index of the row where the text content is
        self.json_link = json_link 
        self.text_column = column
        self.count = 0
    
    def __len__(self): 
        return self.count
    
    def get_nth(self, n): 
        return next(itertools.islice(self.generator(), n, None))
    
    def generator(self): 
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                yield obj
                
    def __iter__(self):  
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                self.count += 1
                yield obj[self.text_column]
        
#     def __iter__(self):  
#         with open(self.csv_link) as csv_file: 
#             reader = csv.reader(csv_file)
#             next(reader, None)
#             for row in reader:
#                 self.count += 1
#                 print(self.count)
#                 if self.count == 398: 
#                     print(row)
#                 yield row[-1]

#                 yield row[self.content_idx] # To get the whole doc, use this instead

In [18]:
link = './data/samples/sample10000.json'
text_column = 'html_text'
mycorpus = MyCorpusJSON(link, text_column)

In [19]:
# # Sample code. prefix: item.author, item.html_text ...
# with open(link) as f: 
#     parser = ijson.parse(f)
#     for prefix, event, value in parser:
#         print('prefix={}, event={}, value={}'.format(prefix, event, value))

In [20]:
# Create a dictionary of all words in corpus - remove the stopwords in the process
dictionary = corpora.Dictionary()
# Add document to dictionary 
for doc in mycorpus: 
    dictionary.add_documents([doc.lower().split()])

In [21]:
dictionary.num_docs

10000

In [22]:
print(dictionary)

Dictionary(1115039 unique tokens: ['#1', '#1with', '#3:', '#swissep', '&']...)


In [23]:
# Remove stopwords 
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in STOPWORDS
    if stopword in dictionary.token2id
]
# Remove id's that occur once 
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids + stop_ids)
dictionary.compactify()

In [24]:
print(dictionary)

Dictionary(248506 unique tokens: ['#1', '#3:', '&', '(especially', '(in']...)


In [25]:
dictionary.save('./models/dictionary/sample10000Dict')

## Testing Models

In [26]:
# Load in Dataset 
class BoWCorpusJSON: 
    def __init__(self, json_link, column, dictionary): 
        # idx is the index of the row where the text content is
        self.json_link = json_link 
        self.column_name = column
        self.count = 0
        self.dictionary = dictionary
    
    def __len__(self): 
        return self.count
        
    def __iter__(self):  
        with open(self.json_link) as json_file: 
            parser = ijson.items(json_file, 'item')
            for obj in parser:
                self.count += 1
#                 print(obj['url'])
                yield self.dictionary.doc2bow(obj[self.column_name].lower().split())
#         with open(self.csv_link) as csv_file: 
#             reader = csv.reader(csv_file)
#             next(reader, None)
#             for row in reader:
#                 self.count += 1
#                 yield dictionary.doc2bow(row[self.content_idx].lower().split())


## TF-IDF Models

In [27]:
bowcorpus = BoWCorpusJSON(link, text_column, dictionary)

In [28]:
# Save corpus
corpora.MmCorpus.serialize(f'./models/sample10000corpus.mm', bowcorpus)

In [29]:
# Load corpus
loaded_corpus = corpora.MmCorpus('./models/sample10000corpus.mm')

In [30]:
tfidf_model = TfidfModel(bowcorpus, dictionary=dictionary)

In [32]:
next(bowcorpus.__iter__())

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 2),
 (12, 2),
 (13, 1),
 (14, 2),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 4),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 2),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 2),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 2),
 (44, 1),
 (45, 4),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 2),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 2),
 (56, 3),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 2),
 (61, 3),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 3),
 (66, 2),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 2),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 3),
 (91, 5),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),

In [153]:
tfidf_model[next(bowcorpus.__iter__())]

[(0, 0.008831299994767733),
 (1, 0.005144301458324677),
 (2, 0.009658736321516476),
 (3, 0.01802064487559183),
 (4, 0.007665093992361952),
 (5, 0.041020037740073616),
 (6, 0.043321757661164056),
 (7, 0.016708361061564984),
 (8, 0.07879599366879654),
 (9, 0.01986820939312529),
 (10, 0.05205051161542952),
 (11, 0.013188516257622655),
 (12, 0.013024116204510414),
 (13, 0.1101841551384785),
 (14, 0.05699680645572014),
 (15, 0.012236907357305213),
 (16, 0.017874567063970764),
 (17, 0.020510018870036808),
 (18, 0.01065641273973226),
 (19, 0.010143731985636869),
 (20, 0.017350170538476505),
 (21, 0.016708361061564984),
 (22, 0.009794112738828912),
 (23, 0.01732554082115547),
 (24, 0.03876795106345363),
 (25, 0.08044440393862709),
 (26, 0.06138852729198212),
 (27, 0.012818584653076785),
 (28, 0.022503661199191335),
 (29, 0.022503661199191335),
 (30, 0.04764277420444872),
 (31, 0.017350170538476505),
 (32, 0.03269672917836592),
 (33, 0.03520086843722998),
 (34, 0.005464210621143038),
 (35, 0.00

In [33]:
len(next(bowcorpus.__iter__()))

378

### Convert a sample text to TFIDF document

In [34]:
text = 'Using machine learning, one can find useful patterns from large data sets to make'\
'data more informative and qualitatively insightful. This is very important for'\
'decision making. Students will be exposed to supervised and unsupervised'\
'learning, respectively.'

In [35]:
text_words = simple_preprocess(text)
tfidf_vec = tfidf_model[dictionary.doc2bow(text_words)]
print(tfidf_vec)

[(794, 0.1122266387490081), (1086, 0.09463764980295794), (1187, 0.07365497550707863), (1220, 0.1113823509375516), (1233, 0.07487090242818138), (1951, 0.21078900333676595), (2150, 0.18722171122505066), (2220, 0.1466336559353409), (4668, 0.16497563946531582), (6429, 0.200468321217535), (9551, 0.2424215949208234), (18140, 0.3954273882327571), (20414, 0.3162519078548135), (24372, 0.3312642171623227), (42351, 0.3830167904661377), (67473, 0.463721994369656)]


## LDA Model

In [36]:
lda_model = models.LdaModel(loaded_corpus, num_topics=200)

In [37]:
# See the topic distribution for sample text 
lda_vec = lda_model[dictionary.doc2bow(text_words)]

In [38]:
# Use LDA for similarity index 
lda_index = similarities.Similarity(None, lda_model[bowcorpus], len(dictionary))

## Get similar documents from the corpus

In [39]:
index = similarities.Similarity(None, tfidf_model[bowcorpus], len(dictionary))

In [40]:
res = index[tfidf_vec]

In [41]:
# Print top 10 documents 
print(list(sorted(enumerate(res), key=lambda x: x[1], reverse=True))[:10])

[(8703, 0.18585998), (482, 0.16874214), (3962, 0.12512183), (3302, 0.120331176), (4922, 0.110262394), (2055, 0.10742292), (9064, 0.10512229), (672, 0.09871555), (230, 0.09689501), (3049, 0.09573233)]


In [44]:
# Get the nth value from the corpus to confirm 
import itertools
index = 3302
mycorpus.get_nth(index)
# next(itertools.islice(mycorpus.get_nth(), index, None))

{'url': 'https://medium.com/iotforall/crash-course-in-machine-learning-4f410018b83',
 'title': 'Crash Course in Machine Learning – IoT For All – Medium',
 'author': {'name': None,
  'url': 'https://medium.com/@narin_luangrath',
  'twitter': None},
 'image_url': 'https://cdn-images-1.medium.com/max/1200/0*ljnuMGQ0HQIszT0O.jpg',
 'html_text': 'Narin LuangrathProduct Engineer at Leverege LLCNov 15, 2017Crash Course in Machine\xa0LearningPart I: Supervised Machine\xa0LearningArtificial (Un)intelligenceWhen you type ‘machine learning’ into Google News, the first link you see is a Forbes Magazine piece called “What’s The Difference Between Machine Learning And Artificial Intelligence?” This article contained so many flowery, grandiose descriptions about ML and AI technology that I couldn’t help but laugh. A few notable quotes include:“To get something out of machine learning, you need to know how to code or know someone who does. With artificial intelligence, you get something that takes an 

## Saving Models

In [45]:
# Save dictionary & corpus
dictionary.save('./models/dictionary/sample10000Dict')
corpora.MmCorpus.serialize(f'./models/sample10000corpus.mm', bowcorpus)

In [46]:
# Save the tfidf model 
tfidf_model.save('./models/tfidf-sample10000')

In [None]:
# Save the LDA model 

In [47]:
lda_model.save('./models/lda-sample10000')

In [180]:
lda_model.get_topics()

array([[7.02464313e-05, 6.57749126e-07, 1.33015255e-05, ...,
        6.57749126e-07, 6.57749126e-07, 6.57749126e-07],
       [9.72484122e-05, 1.18829234e-06, 7.61081465e-04, ...,
        1.18829234e-06, 1.18829234e-06, 1.18829234e-06],
       [1.74260465e-03, 5.97335728e-08, 6.22376688e-02, ...,
        5.97335728e-08, 5.97335728e-08, 5.97335728e-08],
       ...,
       [1.37718511e-03, 6.86899781e-09, 6.97344367e-05, ...,
        6.86899781e-09, 6.86899781e-09, 6.86899781e-09],
       [1.71955598e-05, 1.16311571e-06, 3.88967601e-06, ...,
        1.16311571e-06, 1.16311571e-06, 1.16311571e-06],
       [3.36568832e-04, 6.13788700e-07, 1.16003896e-04, ...,
        5.41717270e-07, 5.41717270e-07, 5.41717270e-07]], dtype=float32)

# Generating Recommendations for a User 

## Create a User profile 
* We will use the context of the class 
* Get all liked/added articles
* Create a combinded text of the top 3 and most recent 2