# Satarova Begaiym ADM HW3

## Libraries & Setup

In [None]:
import os   

from multiprocess import Pool   
import pandas as pd  
import re 

from collections import defaultdict  
import numpy as np   
import time   
import functools   
import nltk   
import hashlib  
import json
import heapq   
import folium

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from haversine import haversine

import matplotlib.pyplot as plt   

import warnings
warnings.filterwarnings('ignore')

import functions

# Remember the directory the notebook starts in; later cells os.chdir() back to it.
main_path = os.getcwd()

# 1. Data Collection

## 1.1. *Get the list of places*

First, I collect the URLs of all the places I need. The function below retrieves the URLs of the "Most popular places" listed in the first $400$ pages and stores them in a *.txt* file.

In [None]:

# Delegate URL collection to the local `functions` module; the next section reads
# the URLs back from 'Places URLs.txt', one per line.
functions.collect_urls('Places URLs.txt')

## 1.2. *Crawl places*

In [None]:
# Store the URLs in a list (one URL per line); the context manager closes the file.
with open('Places URLs.txt', 'r') as urls_file:
    urls_list = [line.rstrip() for line in urls_file]
# Pair every URL with its 0-based index; the downloader receives both.
urls_pairs = [(url, idx) for idx, url in enumerate(urls_list)]

batch_size = 100
i = 0
while i < len(urls_list):
    # Clamp the batch so the final, possibly shorter, batch never points past the
    # end of the list (checking non-existent files used to make the loop spin forever).
    batch_end = min(i + batch_size, len(urls_list))
    try:
        # a pool of 8 worker processes
        with Pool(8) as p:
            # download one batch of HTML files using multiprocessing
            p.map(functions.collect_html_pages, urls_pairs[i:batch_end])
        # Advance only when every file of the batch exists and is larger than 1000 bytes;
        # otherwise the same batch is downloaded again.
        # NOTE(review): Windows-style path kept verbatim (raw string avoids the invalid
        # "\P"/"\D" escapes); it must match what functions.collect_html_pages writes.
        # k//18 presumably reflects 18 places per listing page — confirm.
        if all(os.path.getsize(r"HTML_Pages\Page{}\Doc{}.html".format(k//18 + 1, k + 1)) > 1000 for k in range(i, batch_end)):
            i = batch_end
    except Exception:
        # Retry the batch after a short pause. Catching Exception (not a bare except)
        # keeps Ctrl-C / kernel interrupt able to stop the crawl.
        time.sleep(1)
        continue

## 1.3. *Parse downloaded pages*

In [None]:
# Parse every downloaded page; each page is addressed by its position in urls_list
for page_idx, _url in enumerate(urls_list):
    functions.parse_page(page_idx, urls_list)

# 2. Search Engine

In [None]:
# Porter stemmer, passed to functions.pre_process and applied to the query tokens
stemmer = nltk.stem.PorterStemmer()
# Return to the notebook's starting directory so the relative path below resolves
os.chdir(main_path)
# Directory, relative to main_path, that later cells read the place files from
path = './TSV_Files'

In [None]:
# Pre-process the files under `path` with the stemmer. Later cells use Desc as a
# mapping (Desc.items()) whose values are whitespace-separated token strings.
Desc = functions.pre_process(path, stemmer)

## 2.1. Conjunctive query

In [None]:
# Get all unique words in the collection of document descriptions.
# set.update mutates in place; the previous `unique_words = unique_words.union(...)`
# copied the whole accumulated set on every document.
unique_words = set()
for text in Desc.values():
    unique_words.update(text.split())

### 2.1.1. Create your index!

Hashing all the distinct words from the documents

In [None]:
# Creating Vocabulary file

def convertToNumber(mystring):
    """Return the hexadecimal MD5 digest of ``mystring``.

    The text is encoded with ``str.encode()``'s default codec before hashing.
    Despite the function's name, the result is a hex string, not a number.
    """
    return hashlib.md5(mystring.encode()).hexdigest()
# Build the vocabulary: every distinct word maps to its MD5 hex digest (the term id)
vocabulary = {}
for term in unique_words:
    vocabulary[term] = convertToNumber(term)

# Persist the vocabulary as JSON in the notebook's starting directory
os.chdir(main_path)
with open('vocabulary.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocabulary))

In [None]:
# Compute inverted index
# (built by functions.build_inv_idx from the pre-processed texts and the term-id vocabulary)
inverted_index = functions.build_inv_idx(Desc, vocabulary)

### 2.1.2. Execute the query

In [None]:
# Input query: read one line, split it on whitespace, stem each token, lowercase the stem
query = [stemmer.stem(token).lower() for token in input().split()]

In [None]:
query

['american', 'museum']

In [None]:
# Run the conjunctive search. `docs` is reused by the ranking cells to keep only
# the files returned by this search.
result, docs = functions.searchText(path, query, inverted_index, vocabulary)
result

Unnamed: 0,Title,Description,URL
0,Museum of Mourning Art,Mourning and personal response to death are un...,https://www.atlasobscura.com/places/museum-of-...
1,Madam C.J. Walker Museum & WERD Radio,"This small brick building, located just a few ...",https://www.atlasobscura.com/places/madam-cj-w...
2,Museum of Un-Natural History,"Throughout the ’70s and ’80s, Gerald Matthews ...",https://www.atlasobscura.com/places/museum-of-...
3,Sweet Home Cafe,Thomas Downing was the oyster king. In 19th-ce...,https://www.atlasobscura.com/places/sweet-home...
4,Evel Knievel Museum,The Evel Knievel Museum takes you through the ...,https://www.atlasobscura.com/places/evel-kniev...
...,...,...,...
234,Lake Placid Murals,"Founded in 1992, the Lake Placid Mural Society...",https://www.atlasobscura.com/places/lake-placi...
235,The Hobo Museum,"“Decide your own life, don’t let another perso...",https://www.atlasobscura.com/places/the-hobo-m...
236,Murals of Maxo Vanka,Created in protest against industrial capitali...,https://www.atlasobscura.com/places/murals-of-...
237,Lucky Cat Museum,Probably best known to Americans as the cute p...,https://www.atlasobscura.com/places/lucky-cat-...


## 2.2. Conjunctive query & Ranking score
- Find all the documents that contain all the words in the query.
- Sort them by their similarity with the query.
- Return in output k documents, or all the documents with non-zero similarity with the query when the results are fewer than k. A heap data structure must be used (Python libraries are allowed) to maintain the top-k documents.

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings
tfidf = TfidfVectorizer()

In [None]:
# Two-column array of (key, text) pairs taken from Desc
desc = np.array(list(map(list, Desc.items())))
# Fit TF-IDF on the text column; the matrix rows follow the row order of `desc`
tfidf_sparse = tfidf.fit_transform(desc[:, 1])
# Dense copy of the TF-IDF matrix, passed to functions.build_inv_idx2
result_dense = tfidf_sparse.todense()

### 2.2.1. Inverted index

In [None]:
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() when available and fall back on older releases.
_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
# Terms of the fitted vectorizer, in the column order of the TF-IDF matrix
names = list(_feature_names())
# Words present both in our vocabulary and in the TF-IDF vocabulary
important_words = set(vocabulary.keys()) & set(names)
# File keys, aligned row-by-row with the TF-IDF matrix
files = list(desc[:, 0])

In [None]:
# Second inverted index, built by functions.build_inv_idx2 from the first index and the TF-IDF matrix
# NOTE(review): the last line displays the whole index, which may be very large — consider showing a sample.
inverted_index2 = functions.build_inv_idx2(important_words, vocabulary, inverted_index, files, result_dense, names)
inverted_index2

In [None]:
# Project the stemmed query into the fitted TF-IDF space
query_vector = tfidf.transform([" ".join(query)])
# Cosine similarity of every document row against the query, flattened to 1-D
similarities = cosine_similarity(tfidf_sparse, query_vector).reshape((-1,))

### 2.2.2. Execute the query

In [None]:
# (similarity, file) pair for every document row
pairs = [(score, files[pos]) for pos, score in enumerate(similarities)]
# Keep only the documents returned by the conjunctive search
important_pairs = [candidate for candidate in pairs if candidate[1] in docs]

# Min-heap capped at k entries: whenever it overflows, the smallest pair is evicted,
# so the k largest pairs survive
k = 20
heap = []
for candidate in important_pairs:
    heapq.heappush(heap, candidate)
    if len(heap) > k:
        heapq.heappop(heap)

In [None]:
os.chdir(main_path)
os.chdir(path)
rows = []
# Drain the heap rather than popping exactly k times: it holds fewer than k
# entries when the search matched fewer than k documents.
while heap:
    pair = heapq.heappop(heap)
    with open(pair[1], "r", encoding="utf8") as f:
        a = re.split(r'\t', f.read())
    # Fields read from the place file: 0 = title, 4 = description, 14 = URL
    rows.append({'Title': a[0].strip(), 'Description': a[4], 'URL': a[14].strip(), 'Similarity': pair[0]})
# heappop yields ascending similarity; reverse so the best match comes first.
# Building the frame once replaces DataFrame.append, which pandas 2.0 removed.
result = pd.DataFrame(rows[::-1], columns=['Title', 'Description', 'URL', "Similarity"])

In [None]:
result

Unnamed: 0,Title,Description,URL,Similarity
19,Indian Steps Museum,"Constructed by a local lawyer from 1908-1912, ...",https://www.atlasobscura.com/places/indian-ste...,0.303435
0,Museum of the Weird,The dime or dime store museum is by all accoun...,https://www.atlasobscura.com/places/museum-weird,0.279969
1,Sweet Home Cafe,Thomas Downing was the oyster king. In 19th-ce...,https://www.atlasobscura.com/places/sweet-home...,0.26267
2,Harvard Museum of Natural History,Collecting three different institutions into o...,https://www.atlasobscura.com/places/harvard-mu...,0.257088
3,Siriraj Medical Museum,The Siriraj Medical Museum abounds with medica...,https://www.atlasobscura.com/places/siriraj-me...,0.24509
4,Self-Taught Genius Gallery,"In 2017, the American Folk Art Museum in Manha...",https://www.atlasobscura.com/places/self-taugh...,0.240613
5,Milwaukee Art Museum,Like the Guggenheim in New York and the Oaklan...,https://www.atlasobscura.com/places/milwaukee-...,0.22924
6,National World War II Museum,"Perhaps once thought too narrowly focused, thi...",https://www.atlasobscura.com/places/national-w...,0.219917
7,American Writers Museum,The American Writers Museum—tucked away on the...,https://www.atlasobscura.com/places/american-w...,0.219423
8,Geppi's Entertainment Museum,It’s a unique place that can create a sentimen...,https://www.atlasobscura.com/places/geppi-s-en...,0.206554


# 3. Define a new score!

The goal of this section is to define a new metric that ranks places based on the queries of the users.

In this scenario, a single user can give input more information than a single textual query, so you need to consider all this information and think of a creative and logical way to answer the user's requests.

Practically:

The user will enter a text query. As a starting point, get the query-related documents by exploiting the conjunctive search engine of Section 2.1.

Once you have the documents, you need to sort them according to your new score. In this step, you no longer take into account only the description of the documents; you must also use the remaining variables in your dataset (or new variables that you can create from the existing ones). You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.

In [None]:
# Stemmed, lower-cased query tokens
query = [stemmer.stem(token).lower() for token in input("Enter the query: ").split()]
# Strip surrounding whitespace: the lower-cased country is used as a key into `coords`,
# so a stray leading/trailing space would make every lookup miss.
country = input("Enter the country that is liz like to visit: ").strip()

In [None]:
query

['american', 'museum']

In [None]:
country

'united states'

In [None]:
# Execute query and output result
# (return to main_path first so the relative `path` resolves; `docs` is used below
# to keep only the files returned by this search)
os.chdir(main_path)
result, docs = functions.searchText(path, query, inverted_index, vocabulary)
result

Unnamed: 0,Title,Description,URL
0,Museum of Mourning Art,Mourning and personal response to death are un...,https://www.atlasobscura.com/places/museum-of-...
1,Madam C.J. Walker Museum & WERD Radio,"This small brick building, located just a few ...",https://www.atlasobscura.com/places/madam-cj-w...
2,Museum of Un-Natural History,"Throughout the ’70s and ’80s, Gerald Matthews ...",https://www.atlasobscura.com/places/museum-of-...
3,Sweet Home Cafe,Thomas Downing was the oyster king. In 19th-ce...,https://www.atlasobscura.com/places/sweet-home...
4,Evel Knievel Museum,The Evel Knievel Museum takes you through the ...,https://www.atlasobscura.com/places/evel-kniev...
...,...,...,...
234,Lake Placid Murals,"Founded in 1992, the Lake Placid Mural Society...",https://www.atlasobscura.com/places/lake-placi...
235,The Hobo Museum,"“Decide your own life, don’t let another perso...",https://www.atlasobscura.com/places/the-hobo-m...
236,Murals of Maxo Vanka,Created in protest against industrial capitali...,https://www.atlasobscura.com/places/murals-of-...
237,Lucky Cat Museum,Probably best known to Americans as the cute p...,https://www.atlasobscura.com/places/lucky-cat-...


In [None]:
# Cosine similarity of every document against the NEW query
query_vector_new = tfidf.transform([" ".join(query)])
similarities_new = cosine_similarity(tfidf_sparse, query_vector_new).reshape((-1,))
# Pair each file with its similarity to the new query (this previously read the
# stale `similarities` computed for the earlier query)
pairs_new = [(similarities_new[i], files[i]) for i in range(len(similarities_new))]
# Keep only the files returned by the conjunctive search (this previously filtered
# the stale `pairs` list)
important_pairs_new = [x for x in pairs_new if x[1] in docs]

In [None]:
# Collect names of places for each file:
# place name (first tab-separated field of the file) -> cosine similarity of that file
sim_names = {}
os.chdir(main_path)
os.chdir(path)
for x in pairs_new:
    with open(x[1], 'r', encoding='utf-8') as f:
        # Only the first field is needed, so stop splitting after it
        name = re.split(r'\t+', f.read(), maxsplit=1)[0]
    sim_names[name] = x[0]

In [None]:
# Normalization constants
# Divisor applied to the haversine distance in myscore.
# NOTE(review): 20020 looks like half of Earth's circumference in km, i.e. the largest
# possible great-circle distance — confirm the unit matches haversine()'s output.
max_haversine = 20020
# Used in myscore as 2*max_nearby_places to scale the summed similarities of nearby places.
# NOTE(review): presumably each place lists at most 3 nearby places — confirm against the data.
max_nearby_places = 3

In [None]:
# Coordinates keyed by lower-cased country name, read from CountryCoords.txt
# (comma-separated: field 0 is the name, fields 2 and 3 are parsed as floats)
coords = {}

os.chdir(main_path)
with open("CountryCoords.txt", "r") as coords_file:
    for record in coords_file:
        fields = record.split(",")
        lat_text, lon_text = fields[2], fields[3]
        coords[fields[0].lower()] = (float(lat_text.strip()), float(lon_text.strip()))

coords

{'abkhazia': (43.001525, 41.023415),
 'afghanistan': (34.575503, 69.240073),
 'aland islands': (60.1, 19.933333),
 'albania': (41.327546, 19.818698),
 'algeria': (36.752887, 3.042048),
 'american samoa': (-14.275632, -170.702036),
 'andorra': (42.506317, 1.521835),
 'angola': (-8.839988, 13.289437),
 'anguilla': (18.214813, -63.057441),
 'antarctica': (-90.0, 0.0),
 'antigua and barbuda': (17.12741, -61.846772),
 'argentina': (-34.603684, -58.381559),
 'armenia': (40.179186, 44.499103),
 'aruba': (12.509204, -70.008631),
 'australia': (-35.282, 149.128684),
 'austria': (48.208174, 16.373819),
 'azerbaijan': (40.409262, 49.867092),
 'bahamas': (25.047984, -77.355413),
 'bahrain': (26.228516, 50.58605),
 'bangladesh': (23.810332, 90.412518),
 'barbados': (13.113222, -59.598809),
 'belarus': (53.90454, 27.561524),
 'belgium': (50.85034, 4.35171),
 'belize': (17.251011, -88.75902),
 'benin': (6.496857, 2.628852),
 'bermuda': (32.294816, -64.781375),
 'bhutan': (27.472792, 89.639286),
 'bol

In [None]:
# Custom scoring function
def myscore(x, coords, country):
    """Custom relevance score for one ``(cosine_similarity, filename)`` pair.

    The score is::

        cos_sim * (hav / max_haversine) * (0.5 + sum_nearby / (2 * max_nearby_places))

    where ``hav`` is the haversine distance between ``coords[country]`` and the
    place's coordinates (fields 8 and 9 of its file) and ``sum_nearby`` is the sum
    of ``sim_names`` values for the names listed in field 6.

    Parameters:
        x: pair whose first item is the cosine similarity and whose second item is
           the place file name (opened from the TSV directory).
        coords: mapping from country name to a (lat, lon) tuple.
        country: key to look up in ``coords``.

    Returns:
        The score, or 0 when the file's fields cannot be parsed, a nearby place
        is missing from ``sim_names``, or ``country`` is missing from ``coords``.

    Side effect: changes the working directory to the TSV directory.
    """
    import ast  # local import: literal_eval replaces eval() on file contents

    os.chdir(main_path)
    os.chdir(path)
    sum_nearby = 0
    cos_sim = x[0]
    with open(x[1], 'r', encoding='utf-8') as f:
        try:
            # Split on single tabs, like the other cells that index these files:
            # '\t+' collapses empty fields and shifts every later column.
            a = re.split(r'\t', f.read())
            lat, lon = float(a[8]), float(a[9])
            # Parse the stored list literal without executing arbitrary code
            nearby = ast.literal_eval(a[6])
            if isinstance(nearby, list):
                for el in nearby:
                    sum_nearby += sim_names[el]
            hav = haversine(coords[country], (lat, lon))
        except (IndexError, KeyError, ValueError, SyntaxError, TypeError):
            # Unparseable record or unknown name/country: score it as irrelevant
            return 0

    # NOTE(review): the score grows with the distance from the chosen country;
    # confirm this is intended rather than 1 - hav/max_haversine.
    return cos_sim*(hav/max_haversine)*(0.5 + sum_nearby/(2*max_nearby_places))

In [None]:
# Score every filtered pair with myscore, keeping (score, filename) tuples
custom_scores = [(myscore(x, coords, country.lower()), x[1]) for x in important_pairs_new]

In [None]:
# Min-heap capped at k entries: whenever it overflows, the smallest
# (score, filename) tuple is evicted, so the k largest survive
k = 20
heap = []

for scored_doc in custom_scores:
    heapq.heappush(heap, scored_doc)
    if len(heap) > k:
        heapq.heappop(heap)

In [None]:
os.chdir(main_path)
os.chdir(path)
top_k = []
rows = []
# Drain the heap rather than popping exactly k times: it holds fewer than k
# entries when fewer than k documents were scored.
while heap:
    pair = heapq.heappop(heap)
    # Record the file that was just popped (the append used to run before the pop,
    # so it stored a stale `pair` and missed the last file).
    top_k.append(pair[1])
    with open(pair[1], "r", encoding="utf8") as f:
        a = re.split(r'\t', f.read())
    # Fields read from the place file: 0 = title, 4 = description, 14 = URL
    rows.append({'Title': a[0].strip(), 'Description': a[4], 'URL': a[14].strip(), 'MyScore': pair[0]})
# heappop yields ascending scores; reverse so the best place comes first.
# Building the frame once replaces DataFrame.append, which pandas 2.0 removed.
result = pd.DataFrame(rows[::-1], columns=['Title', 'Description', 'URL', "MyScore"])

In [None]:
result

Unnamed: 0,Title,Description,URL,MyScore
19,Museum of Russian Culture,"When you think San Francisco, you probably don...",https://www.atlasobscura.com/places/museum-of-...,0.016274
0,Museum of the Eye,How do you see the world? Find out at the Muse...,https://www.atlasobscura.com/places/museum-oph...,0.015561
1,Batalion Comic Book Museum and Club,Walking into Prague’sBatalion Comic Book Museu...,https://www.atlasobscura.com/places/batalion-c...,0.013009
2,Pianola Museum,"In a city chock-a-block with museums, one of t...",https://www.atlasobscura.com/places/pianola-mu...,0.01234
3,TinkerTown,Ross Ward began carving wood in junior high sc...,https://www.atlasobscura.com/places/tinkertown,0.009245
4,Crazy Horse Memorial,When the carving of Mount Rushmore began in 19...,https://www.atlasobscura.com/places/crazy-hors...,0.008001
5,Glore Psychiatric Museum,"Located in St. Joseph, Missouri, the Glore Psy...",https://www.atlasobscura.com/places/glore-psyc...,0.007826
6,National Atomic Testing Museum,Las Vegas is an oasis in the desert where no o...,https://www.atlasobscura.com/places/national-a...,0.007043
7,Buckhorn Saloon and Museum,Drink a beer whilst you admire stuffed cougars...,https://www.atlasobscura.com/places/buckhorn-s...,0.006922
8,Milwaukee Art Museum,Like the Guggenheim in New York and the Oaklan...,https://www.atlasobscura.com/places/milwaukee-...,0.005839


A hyperparameter could be introduced into the new score to balance the weight of each component (text similarity, distance, and nearby places) in the evaluation function.

# 4. Visualizing the most relevant places

In [None]:
def plot_map(top_k, country):
    """Build a folium map with one marker per place file in ``top_k``.

    Parameters:
        top_k: iterable of place file names, resolved inside the TSV directory.
        country: country name; its lower-cased form must be a key of ``coords``
            and gives the point the map is centred on.

    Returns:
        The ``folium.Map`` with an orange marker per place. Each marker's popup
        shows the city, country and address parsed from field 7 of the file
        (split on "-") and the value of field 2 ("People been here").

    Side effect: changes the working directory to ``main_path``.
    """
    # Centre the map on the coordinates stored for the requested country
    place_map = folium.Map(coords[country.lower()], zoom_start=5, height="50%", max_bounds=True, control_scale=True)
    os.chdir(main_path)
    for i in top_k:
        # The place files live in the TSV directory, not in main_path itself
        with open(os.path.join(main_path, path, i), "r", encoding="utf8") as f:
            a = f.read()
            a = re.split(r'\t', a)
            name = a[0]
            # Field 7 is read as "<address> - <city, ...> - <country>"
            temp = a[7].split("-")
            city = temp[-2].strip().split(",")[0].strip()
            # Separate name so the `country` argument is not overwritten
            place_country = temp[-1].strip()
            address = "-".join(temp[:-2]).strip()
            ppl = a[2]
            lat, lon = float(a[8]), float(a[9])

            html = '''<style type="text/css">
            .tg  {border-collapse:collapse;border-spacing:0;margin:0px auto;}
            .tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
              overflow:hidden;padding:10px 5px;word-break:normal;}
            .tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
              font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
            .tg .tg-3aoq{border-color:inherit;font-family:"Comic Sans MS", cursive, sans-serif !important;font-weight:bold;text-align:center;
              vertical-align:top}
            .tg .tg-am6h{border-color:inherit;font-family:"Comic Sans MS", cursive, sans-serif !important;text-align:center;vertical-align:top}
            </style>
            <head>''' + ''' <center><h3 style='font-family:"Comic Sans MS", cursive, sans-serif !important'>{}</h3>'''.format(name) + '''</center> </head>
            <table class="tg">
            <tbody>
              <tr>
                <td class="tg-3aoq">City</td>
                <td class="tg-am6h">{}</td>
              </tr>
              <tr>
                <td class="tg-3aoq">Country</td>
                <td class="tg-am6h">{}</td>
              </tr>
              <tr>
                <td class="tg-3aoq">Address</td>
                <td class="tg-am6h">{}</td>
              </tr>
              <tr>
                <td class="tg-3aoq">People<br>been here</td>
                <td class="tg-am6h">{}</td>
              </tr>
            </tbody>
            </table>'''.format(city, place_country, address, ppl)
            popup = folium.Popup(folium.Html(html, script=True), max_width=500)
            folium.Marker(location=[lat, lon], icon=folium.Icon(color="orange", icon="info-sign"), tooltip=name, popup=popup).add_to(place_map)
    return place_map

In [None]:
# Display the map of the top-k places, centred on the chosen country
plot_map(top_k, country)