# Evaluating Recommender Systems for Digital Library Datasets

## Content Based Algorithms
## Comparison/Evaluation based on Metrics/Properties

In [None]:
import pandas as pd
from IPython.display import display, HTML

In [None]:
# Catalogue of the feature-extraction methods and similarity/distance measures under comparison
f_ext = {"Feature Extraction Methods":["TF-IDF", "LSA", "Word2Vec", "Doc2Vec", "BERT", "BoW", "BM25", "LDA", "FastText", "GloVe"]}
sim_m = {"Similarity and Distance Measures":["Cosine Similarity", "Euclidean Distance", "Jaccard Similarity", "Manhattan Distance", "Pearson Correlation", "Bray-Curtis Distance", "Canberra Distance", "Minkowski Distance", "Mahalanobis Distance", "Wasserstein Distance"]}


def _numbered_table(columns):
    """Build a DataFrame from ``columns`` and put a 1-based "No." column in front."""
    table = pd.DataFrame(columns)
    table.insert(0, "No.", range(1, len(table) + 1))
    return table


d_ext = _numbered_table(f_ext)
d_sim = _numbered_table(sim_m)

# Render both tables next to each other instead of stacking them vertically
side_by_side_html = f"""
<div style="display: flex; justify-content: space-around;">
    <div>{d_ext.to_html(index=False)}</div>
    <div>{d_sim.to_html(index=False)}</div>
</div>
"""
display(HTML(side_by_side_html))


No.,Feature Extraction Methods
1,TF-IDF
2,LSA
3,Word2Vec
4,Doc2Vec
5,BERT
6,BoW
7,BM25
8,LDA
9,FastText
10,GloVe

No.,Similarity and Distance Measures
1,Cosine Similarity
2,Euclidean Distance
3,Jaccard Similarity
4,Manhattan Distance
5,Pearson Correlation
6,Bray-Curtis Distance
7,Canberra Distance
8,Minkowski Distance
9,Mahalanobis Distance
10,Wasserstein Distance


### Evaluation Metrics/Properties:
- Prediction Accuracy
    - Ratings Prediction Accuracy ? (ratings)
    - **Usage Prediction (feedback)**
    - **Ranking Measures**
- **Coverage**
- **Confidence**
- Trust
- **Novelty**
- Serendipity
- **Diversity**
- Utility
- Risk
- Robustness
- Privacy
- Adaptability
- Scalability


Choose one book, then run each algorithm to find the books recommended for it, 
based on:
- summary
- title
- authors?
- publishers?

### PROCESS JSON FILES -> get the metadata

In [59]:
import json
import os
import pandas as pd

def process_json_to_dataframe(file_path):
    """Flatten an ``entities.json`` export into one DataFrame row per book.

    The file must contain a JSON list of records. Each record is read through
    its ``model``, ``pk`` and ``fields`` keys; the models handled are
    ``core.entry`` (books), ``core.author``, ``core.entryauthor`` (book/author
    links) and ``core.feed``. Records of any other model are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Summary``, ``Identifiers``, ``Authors`` and
        ``Feeds``. Authors and feed titles are comma-joined; books without any
        get the placeholders ``"No authors"`` / ``"No feeds"``.

    Notes
    -----
    Prints the number of books, authors and feeds found, and sets the pandas
    display options ``display.max_rows`` and ``display.max_colwidth`` to
    ``None`` for the whole session.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    authors = {}
    books = []
    feeds = []
    book_author_mapping = {}
    feed_entry_mapping = {}

    # PROCESS JSON: sort every record into the container for its model
    for item in data:
        fields = item.get("fields", {})
        model = item.get("model")

        if model == "core.entry":
            books.append({
                "id": item["pk"],
                "title": fields.get("title"),
                "summary": fields.get("summary"),
                "identifiers": fields.get("identifiers")
            })
        elif model == "core.author":
            authors[item["pk"]] = {
                "name": fields.get("name", ""),
                "surname": fields.get("surname", "")
            }
        elif model == "core.entryauthor":
            book_author_mapping.setdefault(fields.get("entry"), []).append(fields.get("author"))
        elif model == "core.feed":
            feed_title = fields.get("title")
            entry_ids = fields.get("entries") or []
            feeds.append({
                "id": item["pk"],
                "title": feed_title,
                "entries": entry_ids
            })
            # A feed without a title cannot be shown in the joined "Feeds" column
            if feed_title is not None:
                for entry_id in entry_ids:
                    feed_entry_mapping.setdefault(entry_id, []).append(feed_title)

    # Join every book with its authors and feeds
    book_data = []
    for book in books:
        book_id = book["id"]

        author_names = []
        for author_id in book_author_mapping.get(book_id, []):
            author = authors.get(author_id)
            if author is None:
                # Link points at an author record that is not in this file
                continue
            # Drop empty/null parts so one-part names do not become " Surname" or "None Surname"
            full_name = " ".join(part for part in (author["name"], author["surname"]) if part)
            if full_name:
                author_names.append(full_name)

        feed_titles = feed_entry_mapping.get(book_id, [])

        book_data.append({
            "Title": book["title"],
            "Summary": book["summary"],
            "Identifiers": book["identifiers"],
            "Authors": ", ".join(author_names) if author_names else "No authors",
            "Feeds": ", ".join(feed_titles) if feed_titles else "No feeds"
        })

    print(f"NUM OF BOOKS: {len(books)}\nNUM OF AUTHORS: {len(authors)}\nNUM OF FEED: {len(feeds)}")

    # DATAFRAME
    df = pd.DataFrame(book_data)
    # NOTE: these options are global; every later DataFrame display shows all rows and full text
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', None)
    return df

# PATH to JSON files
# Both exports live two directories above the notebook's working directory,
# each in a folder named after its library.
current_dir = os.getcwd()
json_path_1, json_path_2 = (
    os.path.abspath(os.path.join(current_dir, "..", "..", library_dir, "entities.json"))
    for library_dir in ("openresearchlibrary", "mtf")
)

# Load each export; the loader prints its own book/author/feed counts under the header
print("Open Research Library:")
df_opensearch = process_json_to_dataframe(json_path_1)

print("\nMTF:")
df_mtf = process_json_to_dataframe(json_path_2)


# print("Open Research Library DataFrame:")
# df_opensearch.head(3)

# print("\nMTF DataFrame:")
# df_mtf.head(3)


Open Research Library:
NUM OF BOOKS: 403
NUM OF AUTHORS: 823
NUM OF FEED: 22

MTF:
NUM OF BOOKS: 43
NUM OF AUTHORS: 123
NUM OF FEED: 36


### OPEN RESEARCH LIBRARY - ENTITIES.JSON

In [60]:
# Preview the first five Open Research Library rows
df_opensearch.head(5)

Unnamed: 0,Title,Summary,Identifiers,Authors,Feeds
0,Digital Kenya : An Entrepreneurial Revolution in The Making,Research & Development; Technology Policy; Development Economics,"{""isbn"": ""urn:isbn:9783319184272""}","Bitange Ndemo, Tim Weiss",OPP
1,Environmental Governance of the Baltic Sea (Volume 10.0),Environmental Management; Water Policy,"{""isbn"": ""urn:isbn:9783319270050""}",No authors,BKS
2,Weißbuch Gelenkersatz : Versorgungssituation Bei Endoprothetischen Hüft- Und Knieoperationen in Deutschland,orthopedics; surgery; medicine; biotechnology,"{""isbn"": ""urn:isbn:9783662529041""}",No authors,DBS
3,Saving For Development : How Latin America and The Caribbean Can Save More and Better,development; economic policy,"{""isbn"": ""urn:isbn:9781349949281""}","Eduardo Cavallo, Inter-American Development Bank, Tomás Serebrisky",PrPr
4,"Informatics in the Future : Proceedings of the 11th European Computer Science Summit (ECSS 2015), Vienna, October 2015","Introduction\n\nThis book is open access under a CC BY-NC 4.0 license.\n\nThis volume discusses the prospects and evolution of informatics (or computer science), which has become the operating system of our world, and is today seen as the science of the information society. Its artifacts change the world and its methods have an impact on how we think about and perceive the world. Classical computer science is built on the notion of an “abstract” machine, which can be instantiated by software to any concrete problem-solving machine, changing its behavior in response to external and internal states, allowing for self-reflective and “intelligent” behavior. However, current phenomena such as the Web, cyber physical systems or the Internet of Things show us that we might already have gone beyond this idea, exemplifying a metamorphosis from a stand-alone calculator to the global operating system of our society.\n\nThus computer scientists will need to reconsider the foundations of their discipline to realize the full potential of our field. Taking often contradictory developments into consideration, researchers will not be able to tackle specific technological or methodological problems in the future without also a broader reflection on their field. The papers in this book take a first step forward and reflect on these issues from different perspectives. The broad spectrum of topics includes\n\nInformatics: a discipline with a (short) history and a high impact\nInterdisciplinarity: how to do research\nEthics: what is our responsibility\nDiversity: why are there so few women in informatics\nCombining informatics, history and art: a special contribution.\nThis book is intended for all informatics researchers, in academia as well as in industry. 
It is our responsibility – not only as scientists but also as citizens – to make the public aware of the dichotomies and dialectic relationships of computer science.","{""isbn"": ""urn:isbn:9783319557359""}","Frank van Harmelen, Hannes Werthner",DOVI


### MTF - ENTITIES.JSON

In [61]:
# Preview the first five MTF rows
df_mtf.head(5)

Unnamed: 0,Title,Summary,Identifiers,Authors,Feeds
0,Odporúčanie pre softvérových inžinierov,"Táto knižka je výsledkom doktorandského seminára, ktorý som viedol v akademickom roku 2014/2015. Na Fakulte informatiky a informačných technológií máme šikovných študentov \nschopných naplniť aj náročné predstavy. Jednou takou predstavou je, aby zo seminára vznikol \nmonotematický výskumný text, ktorý dopracujeme do podoby, pripravenej na tlač. V oblasti \nprogramových a informačných systémov sa takéto štúdie podarilo vydať už niekoľkokrát. Zatiaľ \nčo v prvom zväzku Štúdií sme podchytili seminár venovaný návrhovým vzorom a v druhom seminár venovaný webovej inteligencii, v treťom sa seminár sústreďoval na podstatu softvérovej \narchitektúry a v štvrtom zväzku sme spracovali témy seminára, venovaného softvérovým paradigmám. Zatiaľ posledný, piaty zväzok sa venuje webovede, vznikajúcej vedeckej disciplíne, \nktorá chce študovať web v rôznych aspektoch.\nTento v poradí už šiesty zväzok sa zameriava na odporúčanie v softvérovom inžinierstve. \nMetódy odporúčania informácií sa intenzívne študovali v predošlých zhruba dvadsiatich rokoch \nnajmä v súvislosti s odporúčaním informácií na webe. V posledných rokoch dochádza k čoraz \nintenzívnejšiemu uvedomeniu, že tieto alebo podobné metódy môžu byť užitočné aj \nv softvérovom inžinierstve. Ide najmä o odporúčania, ktoré sa poskytnú softvérovému inžinierovi. Ukázalo sa, že je až prekvapujúco veľa možností, čo by bolo vhodné odporúčať niekomu, kto \nsa podieľa na vývoji softvéru. Druhou stránkou je pestrosť metód, ako odporúčania robiť. \nCelú problematiku som rozdelil do trinástich častí medzi študentov seminára. Východiskovým literárnym zdrojom pre štúdium bol monografický zborník [1]. Po prednesení príspevkov \na diskusii na seminári spracovali autori témy aj písomne. 
Prvotnú zodpovednosť za kapitoly sme \nsi podelili takto: Blšták za kapitoly 6 a 7, Bystrický za kapitoly 10 a 12, Frťala za kapitoly 9 a \n13, Kaššák za kapitoly 1 a 4 , Konôpka za kapitoly 3 a 11, Laurinec za kapitolu 8 a Lóderer za \nkapitoly 2 a 5. Spomenutý zborník sa ukázal byť v mnohom aj cennou inšpiráciou pri písaní, čo \ns vďakou priznávame. Autori však preštudovali množstvo ďalšej súčasnej vedeckej literatúry \no príslušnej problematike, o čom svedčia aj zoznamy literatúry pripojené na koniec každej kapitoly. Vedomosti z nich získané tiež využili pri písaní textu.","{""doi"": null, ""isbn"": ""978-3-642-45134-8""}","Pavol Návrat, Miroslav Blšták, Michal Bystrický, Tomáš Frťala, Ondrej Kaššák, Martin Konôpka, Peter Laurinec, Marek Lóderer",PSI_B
1,Úvod do matematickej logiky,"Tento učebný text je určený študentom prvého ročníka fakulty informatiky a in- formačných technológií Slovenskej technickej univerzity v Bratislave. Predstavuje spísane prednášky z predmetu matematická logika.\nŠtruktúra tohoto textu je upravená tak, že každá kapitola tvorí jednu prednášku.\nČitateľovi predkladáme 11 kapitol, čo je podľa našich skúsenosti maximálny možný počet prednášok, ktorý sa dá stihnúť’ počas 13-týždňového semestra. V zlých ”rokoch“ sa fyzicky nestihne odprednášať ani 11 prednášok. Vtedy tento učebný´\ntext slúži študentom na samoštúdium, pretože základne vedomosti majú mat’ všetci študenti rovnaké, bez ohľadu na rok, v ktorom študovali.\nPo obsahovej stránke možno učebnicu rozdeliť’ na štyri časti.\n-\tV prvých troch kapitolách zadeﬁnujeme formuly výrokovej logiky, objasníme si ich význam a ukážeme, ako sa odvodzujú. Ukážeme si tiež, že daným spôsobom vieme formulu odvodiť’ práve vtedy, keď je vždy pravdivá.\n-\tKapitoly 4 až 6 sú nadstavbou prvých troch. Opíšeme si ďalšie metódy, pomocou ktorých vieme zistiť’, či je formula tautológia, ako aj aplikácie výrokovej logiky na spínacie a logické obvody a na neurónové siete.\n-\tĎalšie tri kapitoly sa zaoberajú predikátovou logikou. Tak ako pri výrokovej logike, objasníme si význam formúl predikátovej logiky a metódy ich odvodzovania. Na odvodzovanie formúl budeme používať’ sémantické stromy a Gentzenovský (sekventový) kalkulus.\n-\tV posledných dvoch kapitolách sa zaoberáme netradičnými logikami. Opíšeme si základy modálnej a viachodnotovej logiky.","{""doi"": null, ""isbn"": ""978-80-227-4656-4""}",Martin Knor,MA_B
2,Algebra a diskrétna matematika,"Cieľom tejto učebnice je poskytnúť študentom informatiky na Fakulte informatiky a informačných technológií STU ucelený text k prednáške „Algebra \na diskrétna matematika“. Diskrétna matematika patrí medzi teoretické základy \ninformatiky. Slúži nielen pre rozvoj matematicko-logických schopností študentov, ale aj ako teoretická príprava pre ďalšie „pokročilejšie“ informatické predmety. Pri koncipovaní obsahu tejto prednášky stáli sme pred neľahkou úlohou, čo \nzahrnúť do jej obsahu a čo nie. Pretože táto prednáška substituuje čiastočne aj \nbývalý predmet „Lineárna algebra“, zahrnuli sme z tejto oblasti do učebnice \nv rozsahu dvoch prednášok aj základy lineárnej algebry, teórie matíc a sústav \nlineárnych rovníc spolu s elementárnou teóriou determinantov. \nUčebnica je určená pre študentov prvého ročníka bakalárskeho štúdia, ktorí majú \nzákladné stredoškolské vedomosti z teórie množín, algebry a výrokovej logiky. \nV prednáške sme sa snažili čo najviac vyjsť v ústrety potrebám informatiky, preto \naj oproti časti týkajúcej sa algebry je relatívne uprednostnená diskrétna matematika. Cieľom učebnice je aj rozvinúť u študentov schopnosť rigorózneho matematického myslenia pri riešení a formulovaní problémov informatiky. \nPrvá kapitola sa týka metód matematického dôkazu. Kapitoly 2 až 5 sú venované \nteórii množín a kombinatorike, v 6. a 7. kapitole sa venujeme grupám a boolovskej algebre. Kapitoly 8 až 9 sú venované maticiam, sústavám lineárnych rovníc \na determinantom. Zvyšok učebnice sa v 10. až 13. kapitole venuje teórii grafov \na základným algoritmom a aplikáciám teórie grafov. \nKaždá kapitola je sprevádzaná príkladmi, ktorých riešenie poskytne študentom \nschopnosť dobre sa orientovať v danej problematike. Chceme poďakovať mnohým našim študentom, ktorí nám pomohli nájsť veľa nepríjemných preklepov, \nnepresností a evidentných chýb, a tým prispeli k zvýšeniu kvality tejto učebnice. 
\nTaktiež sa musíme poďakovať nášmu zosnulému kolegovi prof. Ing. Norbertovi \nFrištackému, PhD., s ktorým sme sa často radili pri koncipovaní sylabu prednášky. Na jeho radu sme zaradili do prednášky Quinovu a McCluskeyho metódu \noptimalizácie Boolovej funkcie špecifikujúcej logický obvod. Až pri prednášaní \ntohto predmetu sme zistili, že táto „aplikačná“ časť diskrétnej matematiky patrí \nmedzi študentmi k najobľúbenejšej časti predmetu.","{""doi"": null, ""isbn"": ""9788022729345""}","Vladimír Kvasnička, Jiří Pospíchal",ADM_B
3,Základy digitálnych mien a blockchain sietí,"Tento učebný text je primárne určený ako podporná literatúra pre predmet Digitálne meny<br>a blockchain, no je ho možné z časti použiť aj na predmet Inovácie na finančných trhoch, ktoré<br>sa vyučujú na Fakulte informatiky a informačných technológií Slovenskej technickej univerzite<br>v Bratislave. Čitateľ sa oboznámi so základmi digitálnych a virtuálnych mien ako sú bitcoin,<br>ethereum, polkadot, a iné. Ďalej sa dozvie o blockchain sieťach, ako fungujú decentralizovane<br>a distribuovane, ako ukladajú dáta, ako sa posielajú transakcie a ako sa dosahuje konsenzus , ale<br>nerozoberáme veľmi do hĺbky ako fungujú konsenzuálne algoritmy . Značná časť obsahu je<br>venovaná problematike aplikačných prípad ov použitia blockchain technológie, jej vhodnosti<br>a le okrajovo takisto výzvam, ktorým výskumno vývojové tímy čelili a ich riešeniam Tento<br>materiál nepokrýva všetky oblasti, ktoré sú vyučované v rámci predmetu Digitálne meny a<br>blockchain . Oblasť komunikácie na základe sieťových protokolov a takisto nižšej úrovni<br>kryptogra fie nie sú vôbec v texte spomínané.","{""doi"": null, ""isbn"": ""ISBN 978 80 227 5396 8""}",Kristián Košťál,DMBLOCK_B
4,Quantum Computing for Everyone,Quantum computing is often in the news: China teleported a qubit from earth to a satellite; Shor’s algorithm has put our current encryption methods at risk; quantum key distribution will make encryption safe again;\nGrover’s algorithm will speed up data searches. But what does all this really mean? How does it all work? All of this will be explained.,"{""doi"": ""10.7551/mitpress/11860.001.0001"", ""isbn"": ""9780262350914""}",Chris Bernhardt,No feeds


### Get books metadata Based on ISBN or DOI

In [3]:
pip install isbnlib

Collecting isbnlib
  Using cached isbnlib-3.10.14-py2.py3-none-any.whl.metadata (16 kB)
Using cached isbnlib-3.10.14-py2.py3-none-any.whl (52 kB)
Installing collected packages: isbnlib
Successfully installed isbnlib-3.10.14
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import abc
import json
import urllib.error
from typing import Literal, Optional
from urllib.request import Request, urlopen

from isbnlib import meta, canonical
from isbnlib.registry import bibformatters


class IntrospectionDriver(abc.ABC):
    """Common interface of the metadata look-up back-ends."""

    @abc.abstractmethod
    def resolve(self, identifier: str) -> Optional[dict]:
        """Look up ``identifier`` and return its metadata as a dict, or ``None``.

        Concrete drivers must override this method; the base implementation
        does nothing and returns ``None``.
        """


class IsbnDriver(IntrospectionDriver):
    """Metadata driver backed by ``isbnlib``."""

    def resolve(self, identifier: str) -> Optional[dict]:
        """Look up book metadata for an ISBN-like string.

        Parameters
        ----------
        identifier : str
            Any ISBN-like text; it is normalised with ``isbnlib.canonical``
            before the lookup.

        Returns
        -------
        dict or None
            ``None`` when the identifier cannot be normalised or the lookup
            yields no record. Otherwise a dict with the keys ``publisher``,
            ``doi`` (always ``None``), ``authors`` (list of
            ``{"name", "surname"}`` dicts), ``title``, ``year``, ``language``,
            ``bibtex`` and ``summary``.

        Errors raised by ``isbnlib`` itself (for example service or network
        failures) are not caught here.
        """
        isbn = canonical(identifier)
        if not isbn:
            # Nothing ISBN-shaped was left after normalisation
            return None

        data = meta(isbn)
        if not data:
            # No record found; skip the BibTeX formatter, which needs a populated record
            # NOTE(review): assumes `meta` signals "not found" with an empty value -- confirm
            return None

        result = {
            "publisher": data.get("Publisher"),
            "doi": None,
            "authors": [],
            # Exposed for parity with DoiDriver, which also returns a title
            "title": data.get("Title"),
            "year": data.get("Year"),
            "language": data.get("Language"),
            "bibtex": bibformatters["bibtex"](data),
            # NOTE(review): falls back to the placeholder unless the record has a "summary" key
            "summary": data.get("summary", "Summary not available"),
        }

        for author in data.get("Authors", []):
            # First word becomes the given name, the remainder the surname
            name, _, surname = author.strip().partition(" ")
            result["authors"].append({"name": name, "surname": surname})

        return result


class DoiDriver(IntrospectionDriver):
    """Metadata driver that queries ``https://doi.org`` via content negotiation."""

    def resolve(self, identifier: str) -> Optional[dict]:
        """Fetch CSL-JSON metadata and a BibTeX entry for a DOI.

        Two requests are sent to ``https://doi.org/<identifier>``: one asking
        for ``application/vnd.citationstyles.csl+json`` and one asking for
        ``application/x-bibtex``. Each uses a 5 second timeout.

        Parameters
        ----------
        identifier : str
            The DOI, appended verbatim to the resolver URL.

        Returns
        -------
        dict or None
            ``None`` when either request answers with an HTTP error or the
            first response is not valid JSON. Otherwise a dict with the keys
            ``publisher``, ``doi``, ``authors`` (list of ``{"name", "surname"}``
            dicts), ``title``, ``summary`` and ``bibtex``.
        """
        req = Request(
            url=f"https://doi.org/{identifier}",
            headers={"Accept": "application/vnd.citationstyles.csl+json"},
        )

        try:
            # The context manager closes the connection even if decoding fails
            with urlopen(req, timeout=5) as res:
                data = json.loads(res.read().decode("utf-8"))
        # Several exception types must be given as a tuple; `A | B` is not a valid handler
        except (urllib.error.HTTPError, json.JSONDecodeError):
            return None

        authors = []
        # "author" may be absent or null in the response
        for entry in data.get("author") or []:
            authors.append({
                "name": entry.get("given", ""),
                # NOTE(review): authors without a family name are assumed to carry
                # "literal" or "name" instead -- confirm against real responses
                "surname": entry.get("family") or entry.get("literal") or entry.get("name", ""),
            })

        result = {
            "publisher": data.get("publisher"),
            "doi": data.get("DOI"),
            "authors": authors,
            "title": data.get("title"),
            "summary": data.get("abstract", "Summary not available"),
        }

        req = Request(
            url=f"https://doi.org/{identifier}",
            headers={"Accept": "application/x-bibtex"},
        )

        try:
            with urlopen(req, timeout=5) as res:
                result["bibtex"] = res.read().decode("utf-8")
        except urllib.error.HTTPError:
            return None

        return result


class EntryIntrospectionService:
    """Facade that selects a metadata driver by name and forwards look-ups to it."""

    def __init__(self, driver: Literal["isbn", "doi"]):
        """Create the service for one identifier scheme.

        Parameters
        ----------
        driver : {"isbn", "doi"}
            ``"isbn"`` selects ``IsbnDriver``, ``"doi"`` selects ``DoiDriver``.

        Raises
        ------
        ValueError
            If ``driver`` is neither of the two supported names.
        """
        if driver == "isbn":
            self._driver = IsbnDriver()
        elif driver == "doi":
            self._driver = DoiDriver()
        else:
            # ValueError is still an Exception, so existing `except Exception` callers keep working
            raise ValueError(f"Invalid IntospectionServiceDriver {driver}")

    def resolve(self, identifier: str) -> Optional[dict]:
        """Return whatever the selected driver resolves for ``identifier``."""
        return self._driver.resolve(identifier)

### Example Output for Metadata

In [5]:
# DOI
service_doi = EntryIntrospectionService(driver="doi")
data = service_doi.resolve("10.1109/MWC.015.2300467")

# ISBN
service_isbn = EntryIntrospectionService(driver="isbn")
# data = service_isbn.resolve("9783319483627")

def print_metadata(data):
    """Recursively print nested metadata, one ``'key': 'value'`` line per leaf.

    Dicts are walked key by key: a key holding a dict or list is printed on
    its own line (``'key':``) and its content is printed beneath it. Lists are
    walked element by element. Anything else passed in directly, such as a
    bare string or ``None``, prints nothing.
    """
    if isinstance(data, list):
        for element in data:
            print_metadata(element)
        return

    if not isinstance(data, dict):
        return

    for field, content in data.items():
        is_container = isinstance(content, (dict, list))
        print(f"'{field}':" if is_container else f"'{field}': '{content}'")
        if is_container:
            print_metadata(content)
# Show the metadata resolved above
print_metadata(data)

'publisher': 'Institute of Electrical and Electronics Engineers (IEEE)'
'doi': '10.1109/mwc.015.2300467'
'authors':
'name': 'Zhenlong'
'surname': 'Liao'
'name': 'Jian'
'surname': 'Su'
'name': 'Yinghui'
'surname': 'Ye'
'name': 'Rose Qingyang'
'surname': 'Hu'
'title': 'Wisense: A Dataset for WiFi-Based Human Activity Recognition'
'summary': 'Summary not available'
'bibtex': ' @article{Liao_2024, title={Wisense: A Dataset for WiFi-Based Human Activity Recognition}, volume={31}, ISSN={1558-0687}, url={http://dx.doi.org/10.1109/MWC.015.2300467}, DOI={10.1109/mwc.015.2300467}, number={5}, journal={IEEE Wireless Communications}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Liao, Zhenlong and Su, Jian and Ye, Yinghui and Hu, Rose Qingyang}, year={2024}, month=oct, pages={232–237} }
'


# ALGOS -> Opensearch library

### TF-IDF + Cosine/Euclidean/Manhattan (cosine: no top-N limit -> all recommendations above threshold 0.4; Euclidean/Manhattan: top-N)

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

def tfidf_recommendations(data, book_title, top_n, similarity, threshold=0.4):
    """Recommend books similar to ``book_title`` from TF-IDF vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Title`` and ``combined_opensearch`` (the
        text that gets vectorised).
    book_title : str
        Title of the query book; the first row with this title is used.
    top_n : int
        Number of results for ``"euclidean"`` and ``"manhattan"``. Ignored for
        ``"cosine"``, which returns every book above ``threshold``.
    similarity : {"cosine", "euclidean", "manhattan"}
        Measure used to compare the query book with all other books.
    threshold : float, default 0.4
        Minimum cosine similarity (exclusive) a book needs to be returned.

    Returns
    -------
    list of (int, str, float)
        ``(row position, title, score rounded to 3 decimals)``, best match
        first. The score is a similarity for ``"cosine"`` and a distance for
        the other two measures.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names.
    IndexError
        If no row has the title ``book_title``.
    """
    if similarity not in ("cosine", "euclidean", "manhattan"):
        raise ValueError(
            f"Unknown similarity '{similarity}'; expected 'cosine', 'euclidean' or 'manhattan'"
        )

    # Compute TF-IDF matrix (dense, because cdist does not accept sparse input)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['combined_opensearch']).toarray()

    # Row *position* of the query book, so it lines up with the matrix rows
    # even when the DataFrame index is not 0..n-1
    idx = (data['Title'] == book_title).to_numpy().nonzero()[0][0]
    query_row = tfidf_matrix[idx:idx + 1]
    titles = data['Title']

    if similarity == "cosine":
        # Only the query row is compared; the full n x n matrix is never needed
        scores = cosine_similarity(query_row, tfidf_matrix)[0]
        ranked = sorted(
            ((i, score) for i, score in enumerate(scores) if i != idx and score > threshold),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [(i, titles.iloc[i], round(score, 3)) for i, score in ranked]

    # Cityblock = Manhattan distance
    metric = 'euclidean' if similarity == "euclidean" else 'cityblock'
    distances = cdist(query_row, tfidf_matrix, metric=metric)[0]
    # Drop the query book by position rather than assuming it sorts first:
    # a book with identical text ties at distance 0 and could take its place
    ranked = sorted(
        ((i, dist) for i, dist in enumerate(distances) if i != idx),
        key=lambda pair: pair[1],
    )[:top_n]
    return [(i, titles.iloc[i], round(dist, 3)) for i, dist in ranked]



## 2. LSA and Cosine Similarity

In [100]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


def lsa_recommendations(title, top_n):
    """Recommend the ``top_n`` books closest to ``title`` in LSA space.

    Works on the notebook-level ``df_opensearch`` frame: its
    ``combined_opensearch`` text is TF-IDF vectorised (at most 5000 features),
    reduced to 100 components with ``TruncatedSVD`` and compared by cosine
    similarity.

    Parameters
    ----------
    title : str
        Title of the query book; the first row with this title is used.
    top_n : int
        Maximum number of recommendations.

    Returns
    -------
    list of (int, str)
        ``(row position, title)`` pairs, most similar first; the query book
        itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df_opensearch`` has the given title.
    """
    # Preprocess the text with TF-IDF
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_opensearch['combined_opensearch'])

    # Apply LSA (Latent Semantic Analysis)
    lsa = TruncatedSVD(n_components=100, random_state=42)
    lsa_matrix = lsa.fit_transform(tfidf_matrix)

    # Row *position* of the query book, matching the rows of lsa_matrix
    idx = (df_opensearch['Title'] == title).to_numpy().nonzero()[0][0]

    # Compare only the query row against all books
    scores = cosine_similarity(lsa_matrix[idx:idx + 1], lsa_matrix)[0]

    # Exclude the query book by position; a book with identical text ties with
    # it and could otherwise push the query itself into the results
    ranked = sorted(
        ((i, score) for i, score in enumerate(scores) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    return [(i, df_opensearch['Title'].iloc[i]) for i, _ in ranked]



### LSA + Cosine (No limit -> all recommendations)

In [29]:
# Function to get all recommendations ranked by similarity using LSA
def get_all_lsa_recommendations_filtered(title, cosine_sim=None, threshold=0.4):
    """Return every book whose LSA cosine similarity to ``title`` exceeds ``threshold``.

    Parameters
    ----------
    title : str
        Title of the query book in the notebook-level ``df_opensearch`` frame.
    cosine_sim : 2-D array-like, optional
        Precomputed book-by-book similarity matrix, with rows in the same
        order as ``df_opensearch``. When omitted, it is built here using the
        same settings as ``lsa_recommendations``: TF-IDF with at most 5000
        features, 100 SVD components, ``random_state=42``.
    threshold : float, default 0.4
        Minimum similarity (exclusive) a book needs to be returned.

    Returns
    -------
    (list of (int, str, float), int) or str
        The recommendations as ``(row position, title, similarity rounded to
        3 decimals)`` sorted by descending similarity, plus their count.
        If the title is unknown, a plain "not found" message string is
        returned instead of the tuple.
    """
    if title not in df_opensearch['Title'].values:
        return f"Book '{title}' not found in the database."

    if cosine_sim is None:
        # Build the matrix on demand so this cell runs on a fresh kernel
        # without depending on a leftover `lsa_cosine_sim` variable
        tfidf_matrix = TfidfVectorizer(stop_words='english', max_features=5000).fit_transform(
            df_opensearch['combined_opensearch']
        )
        lsa_matrix = TruncatedSVD(n_components=100, random_state=42).fit_transform(tfidf_matrix)
        cosine_sim = cosine_similarity(lsa_matrix)

    # Row *position* of the query book, matching the rows of the matrix
    idx = (df_opensearch['Title'] == title).to_numpy().nonzero()[0][0]

    # Keep everything above the threshold except the query book, best match first
    filtered_scores = sorted(
        ((i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx and score > threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommendations = [(i, df_opensearch['Title'].iloc[i], round(score, 3)) for i, score in filtered_scores]

    return recommendations, len(recommendations)

# Example usage
book_title = "Uses of Technology in Upper Secondary Mathematics Education"
lsa_threshold = 0.4  # minimum cosine similarity a recommendation must exceed
lsa_result = get_all_lsa_recommendations_filtered(book_title, threshold=lsa_threshold)

if isinstance(lsa_result, str):
    # An unknown title comes back as a plain message instead of (recommendations, count)
    print(lsa_result)
else:
    all_lsa_recommendations, num_recommended = lsa_result

    # Print the number of recommendations and the recommendations ranked by similarity
    print(f"Number of items recommended for '{book_title}' using LSA (filtered for > {lsa_threshold} similarity): {num_recommended}")
    print("Recommendations:")
    for idx, rec_title, similarity in all_lsa_recommendations:
        print(f"{idx}: {rec_title} (Cosine similarity: {similarity})")
    print(len(all_lsa_recommendations))


Number of items recommended for 'Uses of Technology in Upper Secondary Mathematics Education' using LSA (filtered for > 0.0 similarity): 21
Recommendations:
58: Uses of Technology in Lower Secondary Mathematics Education : a Concise Topical Survey (Cosine similarity: 0.942)
94: Semiotics in Mathematics Education (Cosine similarity: 0.854)
11: Theories in and of Mathematics Education (Cosine similarity: 0.831)
337: Transitions in Mathematics Education (Cosine similarity: 0.831)
141: Problem Solving in Mathematics Education (Cosine similarity: 0.789)
147: Design Science and Its Importance in the German Mathematics Educational Discussion (Cosine similarity: 0.786)
104: Proceedings of The 13Th international Congress On Mathematical Education : Icme-13 (Cosine similarity: 0.738)
150: The Philosophy of Mathematics Education (Cosine similarity: 0.69)
88: Attitudes, Beliefs, Motivation and Identity in Mathematics Education : An Overview of The Field and Future Directions (Cosine similarity: 0.

## 3. Word2Vec

In [9]:
pip install gensim

Collecting gensimNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 2.7 MB/s eta 0:00:09
    --------------------------------------- 0.4/24.0 MB 5.0 MB/s eta 0:00:05
   - -------------------------------------- 0.6/24.0 MB 5.6 MB/s eta 0:00:05
   - -------------------------------------- 0.8/24.0 MB 5.0 MB/s eta 0:00:05
   - -------------------------------------- 0.9/24.0 MB 4.3 MB/s eta 0:00:06
   - -------------------------------------- 1.1/24.0 MB 4.3 MB/s eta 0:00:06
   -- ------------------------------------- 1.2/24.0 MB 4.1 MB/s eta 0:00:06
   -- ------------------------------------- 1.4/24.0 MB 4.1 MB/s eta 0:00:06
   -- ------------------------------------- 1.6/24.0 MB 4.1 MB/s eta 0:00

In [98]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Train Word2Vec model
def train_word2vec(data):
    """Fit a Word2Vec model on the whitespace-split ``combined_opensearch`` texts.

    Every row of ``data['combined_opensearch']`` becomes one training sentence.
    The model uses 100-dimensional vectors, a context window of 5, keeps every
    word (``min_count=1``) and trains with 4 workers.
    """
    corpus = []
    for text in data['combined_opensearch']:
        corpus.append(text.split())

    hyperparameters = dict(vector_size=100, window=5, min_count=1, workers=4)
    return Word2Vec(corpus, **hyperparameters)

# Recommend function using Word2Vec and cosine similarity
def recommend_word2vec(data, book_title, top_n, model):
    """Recommend the ``top_n`` books closest to ``book_title`` by averaged word vectors.

    Each book is represented by the mean of the vectors of those words in its
    ``combined_opensearch`` text that ``model.wv`` knows. A text with no known
    word becomes a zero vector. Books are ranked by cosine similarity to the
    query book.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Title`` and ``combined_opensearch``.
    book_title : str
        Title of the query book; the first row with this title is used.
    top_n : int
        Maximum number of recommendations.
    model
        Trained model exposing ``wv`` (word -> vector lookup) and
        ``vector_size``, e.g. the result of ``train_word2vec``.

    Returns
    -------
    list of (int, str)
        ``(row position, title)`` pairs, most similar first; the query book
        itself is never included.

    Raises
    ------
    IndexError
        If no row has the title ``book_title``.
    """
    def vectorize(text):
        # Convert a text into a vector using Word2Vec
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    # Row position of the query book
    idx = (data['Title'] == book_title).to_numpy().nonzero()[0][0]

    # Vectorise every book once and score them in a single call
    doc_vectors = np.vstack([vectorize(desc) for desc in data['combined_opensearch']])
    scores = cosine_similarity(doc_vectors[idx:idx + 1], doc_vectors)[0]

    # Exclude the query book by position instead of assuming it ranks first:
    # a book with identical text ties with it and could take its place
    ranked = sorted(
        ((i, score) for i, score in enumerate(scores) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    return [(i, data['Title'].iloc[i]) for i, _ in ranked]



## 4. Doc2Vec

In [94]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# Train Doc2Vec model
def train_doc2vec(data):
    """Fit a Doc2Vec model on the whitespace-split ``combined_opensearch`` texts.

    Each row becomes one ``TaggedDocument`` tagged with its position in the
    column. The model uses 100-dimensional vectors, a context window of 5,
    keeps every word (``min_count=1``) and trains with 4 workers.
    """
    tagged_corpus = []
    for position, text in enumerate(data['combined_opensearch']):
        tagged_corpus.append(TaggedDocument(text.split(), [position]))

    return Doc2Vec(tagged_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Recommend function using Doc2Vec
def recommend_doc2vec(data, book_title, top_n, model):
    """Recommend the ``top_n`` books closest to ``book_title`` by inferred Doc2Vec vectors.

    A vector is inferred with ``model.infer_vector`` for the query book and
    for every book's ``combined_opensearch`` text. Books are ranked by cosine
    similarity to the query vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Title`` and ``combined_opensearch``.
    book_title : str
        Title of the query book; the first row with this title is used.
    top_n : int
        Maximum number of recommendations.
    model
        Trained model exposing ``infer_vector``, e.g. the result of
        ``train_doc2vec``.

    Returns
    -------
    list of (int, str)
        ``(row position, title)`` pairs, most similar first; the query book
        itself is never included.

    Raises
    ------
    IndexError
        If no row has the title ``book_title``.
    """
    # Row position of the query book
    idx = (data['Title'] == book_title).to_numpy().nonzero()[0][0]

    book_vector = model.infer_vector(data['combined_opensearch'].iloc[idx].split())
    doc_vectors = [model.infer_vector(desc.split()) for desc in data['combined_opensearch']]

    # Score all books in one call
    scores = cosine_similarity([book_vector], doc_vectors)[0]

    # Exclude the query book by position: its re-inferred vector is not
    # guaranteed to rank first, so slicing off the top result is not reliable
    ranked = sorted(
        ((i, score) for i, score in enumerate(scores) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    return [(i, data['Title'].iloc[i]) for i, _ in ranked]



## 5. BERT

In [16]:
pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.4 kB 222.6 kB/s eta 0:00:01
     -------------------------------------- 44.4/44.4 kB 313.3 kB/s eta 0:00:00
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     ---------------------------------------- 57.7/57.7 kB 3.0 MB/s eta 0:00:00
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Dow


[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [90]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def recommend_bert(data, book_title, top_n, model=None):
    """Recommend the books most similar to ``book_title`` using sentence embeddings.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.
        model: Optional object exposing ``encode(texts, convert_to_tensor=True)``.
            When omitted, ``SentenceTransformer('all-MiniLM-L6-v2')`` is loaded,
            which is what this function always did before the parameter existed.
            Pass an already loaded model to avoid reloading it on every call.

    Returns:
        List of ``(row_position, title)`` tuples, most similar first. The
        query row itself is never part of the result.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    # Validate the title first so a typo fails before the model is loaded
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    if model is None:
        # Load pre-trained BERT model
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for all descriptions
    embeddings = model.encode(data['combined_opensearch'].tolist(), convert_to_tensor=True)
    embeddings = embeddings.cpu().numpy()

    # The query book is one of the rows just encoded: reuse its embedding
    # instead of encoding the same text a second time.
    similarities = cosine_similarity(embeddings[query_pos:query_pos + 1], embeddings)[0]

    # Sort by similarity scores, best first
    ranked = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    # Remove the query row by position: with tied scores (e.g. duplicated
    # texts) it is not necessarily the first entry of the ranking.
    candidates = [pos for pos, _ in ranked if pos != query_pos]
    recommendations = [(pos, data['Title'].iloc[pos]) for pos in candidates[:top_n]]

    return recommendations





## 6. BoW

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def recommend_bow(data, book_title, top_n):
    """Recommend the books most similar to ``book_title`` using Bag-of-Words counts.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of ``(row_position, title)`` tuples, most similar first. The
        query row itself is never part of the result.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    # Row *position* of the query book. An index label (``.index[0]``) only
    # matches the matrix row when the DataFrame has a default 0..n-1 index.
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    # Create a Bag of Words representation
    count_vectorizer = CountVectorizer(stop_words='english')
    bow_matrix = count_vectorizer.fit_transform(data['combined_opensearch'])

    # Only the query row of the similarity matrix is needed, so compare that
    # single row against all books instead of building the full n x n matrix.
    similarities = cosine_similarity(bow_matrix[query_pos], bow_matrix)[0]

    # Sort by similarity scores, best first
    ranked = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    # Remove the query row by position: with tied scores (e.g. duplicated
    # texts) it is not necessarily the first entry of the ranking.
    candidates = [pos for pos, _ in ranked if pos != query_pos]
    recommendations = [(pos, data['Title'].iloc[pos]) for pos in candidates[:top_n]]

    return recommendations




## 7. BM25

In [20]:
pip install rank-bm25




[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [83]:
from rank_bm25 import BM25Okapi

# Recommend function using BM25
def recommend_bm25(data, book_title, top_n):
    """Recommend books for ``book_title`` by BM25 score.

    The whitespace tokens of the query book's combined text are used as the
    BM25 query against the whitespace-tokenised texts of all books.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of ``(row_position, title)`` tuples, highest score first. The
        query row itself is never part of the result.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    # Tokenize the combined text
    tokenized_corpus = [desc.split() for desc in data['combined_opensearch']]

    # Initialize BM25
    bm25 = BM25Okapi(tokenized_corpus)

    # The query is the already tokenised text of the input title
    query = tokenized_corpus[query_pos]

    # Compute BM25 scores for the query
    scores = bm25.get_scores(query)

    # Sort by scores, best first
    ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

    # BM25 scores are not normalised, so nothing guarantees that the query
    # row scores highest: remove it by position, not by rank.
    candidates = [pos for pos, _ in ranked if pos != query_pos]
    recommendations = [(pos, data['Title'].iloc[pos]) for pos in candidates[:top_n]]

    return recommendations



## 8. LDA

In [84]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Recommend function using LDA
def lda_recommend(data, book_title, top_n, num_topics=10, random_state=None):
    """Recommend the books whose LDA topic mix is closest to that of ``book_title``.

    An LDA model is trained on the whitespace-tokenised combined texts and the
    per-document topic distributions are compared by cosine similarity.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.
        num_topics: Number of LDA topics.
        random_state: Forwarded to ``LdaModel``; pass a fixed value to make
            repeated runs comparable. ``None`` keeps the previous behaviour.

    Returns:
        List of ``(row_position, title)`` tuples, most similar first. The
        query row itself is never part of the result.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    # Row *position* of the query book. An index label (``.index[0]``) only
    # matches the vector row when the DataFrame has a default 0..n-1 index.
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    # Preprocessing: Tokenize descriptions
    tokenized_descriptions = [desc.split() for desc in data['combined_opensearch']]

    # Create a dictionary and corpus for LDA
    dictionary = Dictionary(tokenized_descriptions)
    corpus = [dictionary.doc2bow(text) for text in tokenized_descriptions]

    # Train the LDA model
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10,
                   random_state=random_state)

    # Dense (n_docs, num_topics) matrix of topic probabilities. Each value is
    # placed by its topic id, so rows stay aligned even if a topic is missing
    # from a document's returned list.
    topic_vectors = np.zeros((len(corpus), num_topics))
    for row, bow in enumerate(corpus):
        for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0):
            topic_vectors[row, topic_id] = prob

    # Compute cosine similarity between the query book and every book
    similarities = cosine_similarity(topic_vectors[query_pos:query_pos + 1], topic_vectors)[0]
    sorted_indices = np.argsort(similarities)[::-1]

    # Remove the query row by position: books with the same topic mix tie
    # with it, so it is not necessarily the first entry of the ranking.
    candidates = [idx for idx in sorted_indices if idx != query_pos]

    # Get top_n recommendations with indices and titles
    recommendations = [(idx, data['Title'].iloc[idx]) for idx in candidates[:top_n]]

    return recommendations




## 9. FastText

In [85]:
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Recommend function using FastText
def fasttext_recommend(data, book_title, top_n):
    """Recommend the books most similar to ``book_title`` using averaged FastText vectors.

    A FastText model is trained on the whitespace-tokenised combined texts;
    each book is represented by the mean of its word vectors.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of ``(row_position, title)`` tuples, most similar first. The
        query row itself is never part of the result.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    # Row *position* of the query book. An index label (``.index[0]``) only
    # matches the vector row when the DataFrame has a default 0..n-1 index.
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    # Preprocessing: Tokenize descriptions
    tokenized_descriptions = [desc.split() for desc in data['combined_opensearch']]

    # Train the FastText model
    model = FastText(tokenized_descriptions, vector_size=100, window=5, min_count=1, epochs=10)

    # Get document vectors by averaging word embeddings
    def get_document_vector(tokens):
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        # A text without usable tokens becomes a zero vector
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    doc_vectors = np.array([get_document_vector(tokens) for tokens in tokenized_descriptions])

    # Compute cosine similarity between the query book and every book
    similarities = cosine_similarity(doc_vectors[query_pos:query_pos + 1], doc_vectors)[0]
    sorted_indices = np.argsort(similarities)[::-1]

    # Remove the query row by position: with tied scores (e.g. duplicated
    # texts) it is not necessarily the first entry of the ranking.
    candidates = [idx for idx in sorted_indices if idx != query_pos]

    # Get top_n recommendations with indices and titles
    recommendations = [(idx, data['Title'].iloc[idx]) for idx in candidates[:top_n]]

    return recommendations




## 10. GloVe

In [81]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# Function to load GloVe embeddings
def load_glove_embeddings(file_path):
    """Read a whitespace-separated embedding text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as ``float32`` values. Blank lines (for
    example a trailing empty line) are skipped instead of raising.

    Args:
        file_path: Path of the UTF-8 encoded embedding file.

    Returns:
        Dict mapping each word to a 1-D ``float32`` NumPy array. If a word
        occurs on several lines, the last one wins.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to index
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Recommend function using GloVe
def glove_recommend(data, book_title, top_n, embedding_dim=50):
    """Recommend the books most similar to ``book_title`` using averaged GloVe vectors.

    The embeddings are read from ``../../glove.6B/glove.6B.<embedding_dim>d.txt``
    relative to the current working directory; each book is represented by
    the mean of the vectors of its lower-cased whitespace tokens.

    Args:
        data: DataFrame with ``'Title'`` and ``'combined_opensearch'`` columns.
        book_title: Exact title of the query book; the first matching row is used.
        top_n: Maximum number of recommendations to return.
        embedding_dim: Dimensionality of the GloVe file to load. It selects
            the file name and the size of the zero vector used for texts
            without any known token.

    Returns:
        List of ``(row_position, title, similarity)`` tuples, most similar
        first, with the similarity rounded to 3 decimals. Only books with a
        similarity above 0 are returned and the query row is never included.

    Raises:
        IndexError: If no row of ``data`` has ``book_title`` as its title.
    """
    # Validate the title first so a typo fails before the embedding file is read.
    # Row *position* is used because an index label only matches the vector
    # row when the DataFrame has a default 0..n-1 index.
    matches = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Book '{book_title}' was not found in data['Title']")
    query_pos = int(matches[0])

    # The file must match embedding_dim, otherwise the zero-vector fallback
    # below would have a different length than the real vectors.
    glove_path = os.path.abspath(
        os.path.join(os.getcwd(), "..", "..", "glove.6B", f"glove.6B.{embedding_dim}d.txt")
    )
    # Load pre-trained GloVe embeddings
    embeddings_index = load_glove_embeddings(glove_path)

    # Preprocessing: tokenize descriptions. The glove.6B vocabulary is
    # lower-cased, so capitalised tokens would never be found without lower().
    tokenized_descriptions = [desc.lower().split() for desc in data['combined_opensearch']]

    # Get document vectors by averaging GloVe word embeddings
    def get_document_vector(tokens):
        vectors = [embeddings_index[word] for word in tokens if word in embeddings_index]
        # Return the mean vector or a zero vector of embedding_dim
        return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

    # Generate vectors for all documents
    doc_vectors = np.array([get_document_vector(tokens) for tokens in tokenized_descriptions])

    # Compute cosine similarity between the query book and every book
    similarities = cosine_similarity(doc_vectors[query_pos:query_pos + 1], doc_vectors)[0]
    sorted_indices = np.argsort(similarities)[::-1]

    # Get top_n recommendations with indices, titles and scores. The query row
    # is removed by position: with tied scores it is not necessarily first.
    recommendations = [(idx, data['Title'].iloc[idx], round(similarities[idx], 3))
                       for idx in sorted_indices
                       if idx != query_pos and similarities[idx] > 0.0][:top_n]

    return recommendations



## Example Usage - TEST

In [109]:
book_title = "Uses of Technology in Upper Secondary Mathematics Education"
print(f"BOOK TITLE: {book_title}\n")
top_n = 5

# COMBINE FEATURES (ADD authors and feeds somehow)
# fillna(""): string concatenation with a missing value yields NaN, and the
# recommenders call .split() on every combined text.
df_opensearch['combined_opensearch'] = df_opensearch['Title'].fillna("") + " " + df_opensearch['Summary'].fillna("")

df_mtf['combined_mtf'] = df_mtf['Title'].fillna("") + " " + df_mtf['Summary'].fillna("")


# Check if the book title exists in the data. Stop right here when it does
# not: the recommenders below look the title up and would otherwise fail
# later with a far less helpful error.
if book_title not in df_opensearch['Title'].values:
    raise ValueError(f"Book '{book_title}' was not found!")

# PRINT ALL RECOMMENDATIONS
def show_rec(algo, all_rec, score_label="Cosine similarity"):
    """Print a list of recommendations under a heading.

    Args:
        algo: Label of the algorithm, printed in the heading.
        all_rec: Sized collection of ``(idx, title)`` or ``(idx, title, score)``
            tuples. Tuples of any other length are not printed but still
            count towards the reported total.
        score_label: Name printed in front of the score of 3-tuples. The
            default matches the previous fixed wording; pass another name
            when the score is not a cosine similarity (e.g. a distance).
    """
    print(f"Recommendations using: {algo}")
    for rec in all_rec:
        # Check if similarity is provided
        if len(rec) == 3:
            idx, rec_title, similarity = rec
            print(f"{idx}: {rec_title} ({score_label}: {similarity})")
        elif len(rec) == 2:
            idx, rec_title = rec
            print(f"{idx}: {rec_title}")
    print(f"Num of RECOMMENDATIONS: {len(all_rec)}\n\n")


# GET RECOMMENDATIONS TOP 5
# ALGOS:

# 1. TF-IDF
similarity = "euclidean"
tfidf_rec = tfidf_recommendations(df_opensearch, book_title, top_n, similarity)
# Label the printed score with the measure actually used, not "Cosine similarity"
show_rec(f"1. TF-IDF + {similarity}", tfidf_rec, score_label=f"{similarity} score")

# ALL OTHER ALGOS USE COSINE NOW (except BM25)
# 2. LSA
lsa_rec = lsa_recommendations(book_title, top_n)
show_rec("2. LSA", lsa_rec)

# 3. Word2Vec
# Generate Word2Vec model
word2vec_model = train_word2vec(df_opensearch)
w2v_rec = recommend_word2vec(df_opensearch, book_title, top_n, word2vec_model)
show_rec("3. Word2Vec", w2v_rec)

# 4. Doc2Vec
# Generate Doc2Vec model
doc2vec_model = train_doc2vec(df_opensearch)
d2v_rec = recommend_doc2vec(df_opensearch, book_title, top_n, doc2vec_model)
show_rec("4. Doc2Vec", d2v_rec)

# 5. BERT
bert_rec = recommend_bert(df_opensearch, book_title, top_n)
show_rec("5. BERT", bert_rec)

# 6. BoW
bow_rec = recommend_bow(df_opensearch, book_title, top_n)
show_rec("6. BoW", bow_rec)

# 7. BM25
bm25_rec = recommend_bm25(df_opensearch, book_title, top_n)
show_rec("7. BM25", bm25_rec)

# 8. LDA
lda_rec = lda_recommend(df_opensearch, book_title, top_n)
show_rec("8. LDA", lda_rec)

# 9. FASTTEXT
fasttext_rec = fasttext_recommend(df_opensearch, book_title, top_n)
show_rec("9. FastText", fasttext_rec)

# 10. GloVe
glove_rec = glove_recommend(df_opensearch, book_title, top_n, embedding_dim=50)
show_rec("10. GloVe", glove_rec)




BOOK TITLE: Uses of Technology in Upper Secondary Mathematics Education

Recommendations using: 1. TF-IDF + euclidean
58: Uses of Technology in Lower Secondary Mathematics Education : a Concise Topical Survey (Cosine similarity: 0.839)
11: Theories in and of Mathematics Education (Cosine similarity: 0.918)
337: Transitions in Mathematics Education (Cosine similarity: 0.939)
94: Semiotics in Mathematics Education (Cosine similarity: 0.967)
9: Research on Teaching and Learning Probability (Cosine similarity: 1.014)
Num of RECOMMENDATIONS: 5


Recommendations using: 2. LSA
58: Uses of Technology in Lower Secondary Mathematics Education : a Concise Topical Survey
94: Semiotics in Mathematics Education
11: Theories in and of Mathematics Education
337: Transitions in Mathematics Education
141: Problem Solving in Mathematics Education
Num of RECOMMENDATIONS: 5


Recommendations using: 3. Word2Vec
11: Theories in and of Mathematics Education
58: Uses of Technology in Lower Secondary Mathematic