## Preprocessing

In [1]:
import json
import analizer as ana
import numpy as np
from tqdm import tqdm
import random

In [2]:
# Build the corpus structures from the scraped items (one JSON object per line).
vocab = set()        # every distinct word seen in a kept document
docs = set()         # URLs already kept; used to drop repeated scrapes
frecuency = dict()   # word -> number of occurrences across kept documents
tuples = []          # (url, body) for each kept document, in file order
with open('myspider_items.jl', encoding='utf-8') as itemsScrapped:
    # Iterate the file lazily; readlines() would hold every line in memory at once.
    for line in itemsScrapped:
        if not line.strip():
            # json.loads raises on an empty line, so skip blank lines.
            continue
        item = json.loads(line)
        if item['url'] in docs:
            # Same URL scraped more than once: report it and keep the first copy only.
            print(item['url'])
            continue
        tuples.append((item['url'], item['body']))
        docs.add(item['url'])
        # Each element of the body counts as one word occurrence
        # (presumably the body is a list of tokens — confirm against the spider).
        for word in item['body']:
            vocab.add(word)
            frecuency[word] = frecuency.get(word, 0) + 1
            

In [3]:
# Rank the (word, count) pairs from highest to lowest count: stable ascending
# sort on the count, then flip the whole list.
vocab_frecuency = sorted(frecuency.items(), key=lambda pair: pair[1])[::-1]

In [4]:
# Show the first entry of the ranked list, i.e. the pair with the highest count.
vocab_frecuency[0]

('also', 22981)

In [5]:
def write_vocab(vocab:list):
    """Write the first element of every pair in ``vocab`` to ``vocab.txt``.

    One entry is written per line, in the order given; the second element of
    each pair is ignored. The file is created or overwritten in the current
    working directory and encoded as UTF-8.
    """
    with open('vocab.txt','w',encoding='utf-8') as out:
        out.writelines(word + '\n' for word, _ in vocab)


In [6]:
def write_docs_names(docs):
    """Write every document name in ``docs`` to ``docs.txt``, one per line.

    Names are written in the iteration order of ``docs``. The file is created
    or overwritten in the current working directory and encoded as UTF-8.
    """
    with open('docs.txt','w',encoding='utf-8') as out:
        out.writelines(name + '\n' for name in docs)

        

In [7]:
# Persist the kept URLs to docs.txt (docs is a set, so the line order follows set iteration order).
write_docs_names(docs)

In [8]:
# Persist the vocabulary to vocab.txt in ranked order (highest count first).
write_vocab(vocab_frecuency)

In [9]:
# Drop the word-count dict; its pairs were already copied into vocab_frecuency.
# NOTE(review): after this, the cell that builds vocab_frecuency cannot be re-run
# without first re-running the corpus-building cell.
del frecuency

In [10]:
# Turn the sets into lists so documents and words can be addressed by position.
document_names = list(docs)
vocabulary = list(vocab)

In [12]:
# Build the term-document matrix with the analizer helper.
# NOTE(review): matrix layout (rows vs. columns) is defined in analizer — confirm there.
td_matrix = ana.create_term_document_matrix(tuples,document_names,vocabulary)

100%|██████████| 10644/10644 [00:05<00:00, 2020.13it/s]


In [13]:
# Rank the corpus against the Donald Trump page once per similarity function
# listed in ana.similarity_fns, and print the first ten results of each ranking.
index = document_names.index('https://en.wikipedia.org/wiki/Donald_Trump')
for sim_fn in ana.similarity_fns:
    print('\nThe 10 most similar plays to "%s" using %s are:' % (document_names[index], sim_fn.__qualname__))
    # ranks is indexed below to obtain positions into document_names;
    # presumably ordered best match first — confirm in analizer.rank_plays.
    ranks = ana.rank_plays(index, td_matrix, sim_fn)
    for idx in range(0, 10):
        doc_id = ranks[idx]
        print('%d: %s' % (idx+1, document_names[doc_id]))

  0%|          | 1/10644 [00:00<20:31,  8.64it/s]


The 10 most similar plays to "https://en.wikipedia.org/wiki/Donald_Trump" using compute_cosine_similarity are:


100%|██████████| 10644/10644 [08:27<00:00, 20.99it/s]
  0%|          | 3/10644 [00:00<08:26, 21.01it/s]

1: https://en.wikipedia.org/wiki/Ileana_Ros-Lehtinen
2: https://en.wikipedia.org/wiki/History_of_Haiti
3: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
4: https://en.wikipedia.org/wiki/Dirty_War
5: https://en.wikipedia.org/wiki/2013_Egyptian_coup_d%27%C3%A9tat
6: https://en.wikipedia.org/wiki/Steve_Knight_(politician)
7: https://en.wikipedia.org/wiki/United_States_House_Committee_on_Foreign_Affairs
8: https://en.wikipedia.org/wiki/American_Imperialism
9: https://en.wikipedia.org/w/index.php?title=American_Imperialism&printable=yes
10: https://en.wikipedia.org/wiki/Impeachment_of_Bill_Clinton

The 10 most similar plays to "https://en.wikipedia.org/wiki/Donald_Trump" using compute_jaccard_similarity are:


100%|██████████| 10644/10644 [08:25<00:00, 21.06it/s]
  0%|          | 3/10644 [00:00<08:08, 21.77it/s]

1: https://en.wikipedia.org/w/index.php?title=Franklin_D._Roosevelt&curid=10979&diff=922811538&oldid=922580449
2: https://en.wikipedia.org/wiki/Mexico
3: https://en.wikipedia.org/wiki/History_of_Haiti
4: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
5: https://en.wikipedia.org/w/index.php?title=Theodore_Roosevelt&curid=30535&diff=922818785&oldid=922811931
6: https://en.wikipedia.org/wiki/Dirty_War
7: https://en.wikipedia.org/w/index.php?title=Montana&oldid=920470632
8: https://en.wikipedia.org/w/index.php?title=Montana&diff=920208135&oldid=920207177
9: https://en.wikipedia.org/w/index.php?title=Montana&printable=yes
10: https://en.wikipedia.org/w/index.php?title=Montana&stableid=921307467

The 10 most similar plays to "https://en.wikipedia.org/wiki/Donald_Trump" using compute_dice_similarity are:


100%|██████████| 10644/10644 [08:49<00:00, 20.10it/s]

1: https://en.wikipedia.org/w/index.php?title=Franklin_D._Roosevelt&curid=10979&diff=922811538&oldid=922580449
2: https://en.wikipedia.org/wiki/Mexico
3: https://en.wikipedia.org/wiki/History_of_Haiti
4: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
5: https://en.wikipedia.org/w/index.php?title=Theodore_Roosevelt&curid=30535&diff=922818785&oldid=922811931
6: https://en.wikipedia.org/wiki/Dirty_War
7: https://en.wikipedia.org/w/index.php?title=Montana&oldid=920470632
8: https://en.wikipedia.org/w/index.php?title=Montana&diff=920208135&oldid=920207177
9: https://en.wikipedia.org/w/index.php?title=Montana&printable=yes
10: https://en.wikipedia.org/w/index.php?title=Montana&stableid=921307467





## Main functions

In [10]:
import json
import analizer as ana
import numpy as np
from tqdm import tqdm
import random

In [2]:
# Loads the vocabulary and the page names (one entry per line).
def load_item_from_file(file_path):
    """Read a UTF-8 text file and return its lines without the line terminator.

    Args:
        file_path: path of the text file to read.

    Returns:
        list[str]: one string per line of the file, in file order.
    """
    with open(file_path,encoding='utf-8') as item_file:
        # Remove only the newline itself. Unconditionally dropping the last
        # character would truncate a final line that has no trailing "\n".
        return [line.rstrip('\n') for line in item_file]

In [3]:
# Loads the (url, list of words) tuples.
def load_tuples_from_file(tuples_path):
    """Read a JSON-lines file and return one ``(url, body)`` tuple per record.

    Every non-blank line must be a JSON object with ``'url'`` and ``'body'`` keys.

    Args:
        tuples_path: path of the UTF-8 JSON-lines file to read.

    Returns:
        list[tuple]: ``(item['url'], item['body'])`` for each record, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks the ``'url'`` or ``'body'`` key.
    """
    tuples = []
    with open(tuples_path,encoding='utf-8') as itemsScrapped:
        # Stream the file line by line instead of loading it all with readlines().
        for line in itemsScrapped:
            if not line.strip():
                # A blank line (e.g. a trailing one) is not a record; json.loads would raise on it.
                continue
            item = json.loads(line)
            tuples.append((item['url'],item['body']))
    return tuples

In [9]:
# Scraping is launched with the command
#   scrapy parse --pipelines --spider=singlePageSpider {url}
from subprocess import call
def call_for_a_page(url):
    """Run the ``singlePageSpider`` scrapy spider on ``url``.

    Returns whatever ``subprocess.call`` returns for the command. The call is
    given a 60-second timeout.
    """
    command = ['scrapy', 'parse', '--pipelines', '--spider=singlePageSpider', url]
    return call(command, timeout=60)

def get_similar_pages(url,document_names,td_matrix,n):
    """Print the first ``n`` ranked pages for ``url`` under every similarity function.

    For each function in ``ana.similarity_fns`` a header line is printed,
    followed by up to ``n`` numbered document names taken from the ranking
    returned by ``ana.rank_plays``.

    Args:
        url: name of the query document; must be present in ``document_names``.
        document_names: list of document names, indexed by document id.
        td_matrix: term-document matrix passed through to ``ana.rank_plays``.
        n: maximum number of results to print per similarity function.

    Raises:
        ValueError: if ``url`` is not in ``document_names``.
    """
    index = document_names.index(url)
    for sim_fn in ana.similarity_fns:
        # NOTE(review): the message says "plays" although the items are pages; wording left unchanged.
        print(f'\nThe {n} most similar plays to {document_names[index]} using {sim_fn.__qualname__} are:')
        ranks = ana.rank_plays(index, td_matrix, sim_fn)
        # Slice rather than index ranks[0..n-1]: asking for more results than
        # were ranked prints what exists instead of raising IndexError.
        # max() keeps a negative n printing nothing, as range(0, n) did.
        for position, doc_id in enumerate(ranks[:max(n, 0)], start=1):
            print('%d: %s' % (position, document_names[doc_id]))

In [5]:
# Page whose most similar pages we want to find.
page_to_compare = 'https://en.wikipedia.org/wiki/Cuba'
# Launch the scraper for that page; retcall keeps the value returned by call_for_a_page.
retcall = call_for_a_page(page_to_compare)

In [6]:
# Load the inputs: the vocabulary, the document names and the scraped (url, body) tuples.
vocab = load_item_from_file('vocab.txt')
docs = load_item_from_file('docs.txt')
tuples = load_tuples_from_file('myspider_items.jl')

In [7]:
# Load what was scraped for the page we want to compare.
single_tuple = load_tuples_from_file('singlePageSpider_items.jl')
if not single_tuple:
    print(f"No se pudo scrapear la pagina {page_to_compare}")
else:
    # Extend the corpus only once. Without this guard, re-running the cell
    # (or comparing a page that is already in docs) would append duplicate
    # entries to tuples and docs.
    if page_to_compare not in docs:
        tuples+=single_tuple
        docs.append(page_to_compare)
    td_matrix = ana.create_term_document_matrix(tuples,docs,vocab)
    get_similar_pages(page_to_compare,docs,td_matrix,10)
    

100%|██████████| 10645/10645 [00:05<00:00, 2073.35it/s]
  0%|          | 2/10645 [00:00<12:08, 14.62it/s]


The 10 most similar plays to "https://en.wikipedia.org/wiki/Cuba" using compute_cosine_similarity are:


100%|██████████| 10645/10645 [08:43<00:00, 20.33it/s]
  0%|          | 3/10645 [00:00<08:23, 21.12it/s]

1: https://en.wikipedia.org/wiki/Elections_in_Cuba
2: https://en.wikipedia.org/wiki/American_Imperialism
3: https://en.wikipedia.org/w/index.php?title=American_Imperialism&printable=yes
4: https://en.wikipedia.org/wiki/History_of_Haiti
5: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
6: https://en.wikipedia.org/wiki/Mexico
7: https://en.wikipedia.org/wiki/Micronesia,_Federated_States_of
8: https://en.wikipedia.org/w/index.php?title=Federated_States_of_Micronesia&printable=yes
9: https://en.wikipedia.org/wiki/CAR
10: https://en.wikipedia.org/wiki/Cold_War_(1962%E2%80%931979)

The 10 most similar plays to "https://en.wikipedia.org/wiki/Cuba" using compute_jaccard_similarity are:


100%|██████████| 10645/10645 [08:56<00:00, 19.85it/s]
  0%|          | 2/10645 [00:00<09:29, 18.69it/s]

1: https://en.wikipedia.org/wiki/History_of_Haiti
2: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
3: https://en.wikipedia.org/wiki/Elections_in_Cuba
4: https://en.wikipedia.org/wiki/American_Imperialism
5: https://en.wikipedia.org/w/index.php?title=American_Imperialism&printable=yes
6: https://en.wikipedia.org/wiki/Mexico
7: https://en.wikipedia.org/wiki/Dirty_War
8: https://en.wikipedia.org/wiki/The_Netherlands
9: https://en.wikipedia.org/w/index.php?title=Netherlands&printable=yes
10: https://en.wikipedia.org/wiki/Europe

The 10 most similar plays to "https://en.wikipedia.org/wiki/Cuba" using compute_dice_similarity are:


100%|██████████| 10645/10645 [08:54<00:00, 19.92it/s]

1: https://en.wikipedia.org/wiki/History_of_Haiti
2: https://en.wikipedia.org/w/index.php?title=History_of_Haiti&printable=yes
3: https://en.wikipedia.org/wiki/Elections_in_Cuba
4: https://en.wikipedia.org/wiki/American_Imperialism
5: https://en.wikipedia.org/w/index.php?title=American_Imperialism&printable=yes
6: https://en.wikipedia.org/wiki/Mexico
7: https://en.wikipedia.org/wiki/Dirty_War
8: https://en.wikipedia.org/wiki/The_Netherlands
9: https://en.wikipedia.org/w/index.php?title=Netherlands&printable=yes
10: https://en.wikipedia.org/wiki/Europe



