__Obiettivo__ 

Classificare i contenuti estrapolati secondo una lista predefinita di topic.

Il vero scopo del notebook è valutare la validità dei risultati ottenuti con il modello di classificazione BERTopic. A tal fine vengono acquisiti, tramite l'API di arXiv, alcuni articoli accademici inerenti al _Quantum Computing_: uniti ad alcuni paper del dominio trattato finora, permettono di valutare la bontà del modello di classificazione.

In [2]:
import os

import pandas
from nltk.corpus import stopwords

# English stop words kept as a set; used for membership tests when filtering tokens.
set_stopwords = set(stopwords.words("english"))

In [3]:
class Paper:
    """Minimal record for a paper: its DOI, title and abstract."""

    def __init__(self, DOI: str, title: str, abstract: str):
        self.DOI, self.title, self.abstract = DOI, title, abstract

    def __str__(self):
        # Both fields go through str() so non-string values still render.
        return " ".join([str(self.DOI), str(self.title)])

In [4]:
# Empty frame declaring the columns (DOI, Title, Abstract) of the evaluation dataset built in the following cells.
new_df = pandas.DataFrame(columns=["DOI", "Title", "Abstract"])

In [5]:
# Load the already preprocessed dataset from JSON; the path is relative to the working directory.
df_preprocessed = pandas.read_json("../../json/dataset/dataset_preprocessed.json")

In [6]:
# Preview the first rows of the loaded dataset.
df_preprocessed.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction,Text
2,10.1007/3-540-44595-1_3,intelligent tutor web based chess course,"[textbooks, chess, teachers, tutor, tutoring]",web based intelligent tutoring system try fill...,course common way organize teaching high level...,web based intelligent tutoring system try fill...
5,10.3233/icg-1998-21203,learning play chess using temporal difference,"[td, knightcap, algorithm, internet, chess]",paper present tdleaf variation algorithm enabl...,temporal difference learning first introduced ...,paper present tdleaf variation algorithm enabl...
7,10.1016/s0020-0255(99)00093-6,temporal dierence learning heuristic search ga...,"[backgammon, championship, algorithmic, td, ch...",temporal dierence learning natural method rein...,central heuristic search whether single agent ...,temporal dierence learning natural method rein...
8,10.1006/ijhc.1999.0338,incremental acquisition search knowledge,"[hampered, intractable, expert, intelligence, ...",expert system nition address particular domain...,early day arti cial intelligence strong emphas...,expert system nition address particular domain...
9,10.1016/s0020-0255(99)00097-3,,"[century, strategy, search, algorithm, algorit...",half century since minimax rst suggested strat...,standard approach game tree search use improve...,half century since minimax rst suggested strat...


In [7]:
import random
from typing import List

def get_random_chess_papers(sample_size: int = 40, seed=None) -> List[object]:
    """Draw distinct random rows from ``df_preprocessed``.

    Args:
        sample_size: Number of rows to draw. Defaults to 40, the value that
            used to be hard-coded.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the selection is reproducible; when ``None`` (default) the
            module-level ``random`` generator is used, as before.

    Returns:
        A list with one row of ``df_preprocessed`` per selected paper, without
        repetitions.

    Raises:
        ValueError: If ``sample_size`` is negative or larger than the number
            of rows available (previously this surfaced as an opaque
            ``IndexError`` part-way through the loop).
    """
    total_rows = len(df_preprocessed)
    if not 0 <= sample_size <= total_rows:
        raise ValueError(
            f"sample_size must be between 0 and {total_rows}, got {sample_size}"
        )

    rng = random if seed is None else random.Random(seed)

    # Sample row positions instead of index labels: a single pass without the
    # repeated list.remove(), and each pick is exactly one row even if the
    # frame's index contains duplicate labels.
    positions = rng.sample(range(total_rows), sample_size)
    return [df_preprocessed.iloc[position] for position in positions]

# Pick the chess papers, then keep only the three fields needed for the test set.
list_papers_selected = get_random_chess_papers()

list_rows = [
    {column: paper[column] for column in ("DOI", "Title", "Abstract")}
    for paper in list_papers_selected
]

In [8]:
# Build the frame directly from the chess rows instead of concatenating them onto
# the empty ``new_df``: recent pandas versions warn about concatenating empty
# frames, and the concat form kept appending the same rows on every re-run.
new_df = pandas.DataFrame(list_rows, columns=["DOI", "Title", "Abstract"])
new_df.shape

(40, 3)

In [10]:
import json
import arxiv

from typing import Dict

def get_arxiv_quantum_papers(
    query: str = "quantum",
    max_results: int = 40,
    download_dir: str = "../../articles/articlesBertTest",
) -> Dict[str, Paper]:
    """Search arXiv, download each result's PDF and collect its metadata.

    Args:
        query: arXiv search query. Defaults to ``"quantum"``.
        max_results: Maximum number of results requested. Defaults to 40.
        download_dir: Directory the PDFs are downloaded into. It is created
            when missing.

    Returns:
        A dict mapping each result's short arXiv id to a ``Paper`` built from
        the result's ``doi``, ``title`` and ``summary``.
    """
    # Create the target directory up front so the first download does not fail
    # on a fresh checkout (download_pdf presumably expects it to exist).
    os.makedirs(download_dir, exist_ok=True)

    arxiv_client = arxiv.Client()
    search = arxiv.Search(query=query, max_results=max_results)

    papers: Dict[str, Paper] = {}
    for result in arxiv_client.results(search):
        result.download_pdf(download_dir)
        # NOTE(review): result.doi may be missing for some entries - confirm;
        # the short id is what is used as the dictionary key.
        papers[result.get_short_id()] = Paper(result.doi, result.title, result.summary)

    return papers

dict_papers_selected = get_arxiv_quantum_papers()

# The dictionary key (the short arXiv id) is what ends up in the "DOI" column.
list_rows = [
    {"DOI": short_id, "Title": paper.title, "Abstract": paper.abstract}
    for short_id, paper in dict_papers_selected.items()
]

In [11]:
# Append the arXiv rows to the frame; ignore_index=True renumbers the rows from 0.
# NOTE(review): re-running this cell appends the same rows again.
new_df = pandas.concat([new_df, pandas.DataFrame(list_rows)], ignore_index=True)
new_df.shape

(80, 3)

In [12]:
# dropna() returns a new frame: the bare call discarded its result, so no row was
# ever removed. Keep the result, restricted to "Abstract", the column that the
# following cells preprocess and embed.
# NOTE(review): use a plain dropna() instead if rows missing a DOI or a title
# must be removed as well.
new_df = new_df.dropna(subset=["Abstract"])

# NOTE(review): the file name says "bart" while the notebook is about BERTopic - confirm.
with open("../../json/dataset/dataset_bart_test.json", "w") as file:
    json.dump(new_df.to_dict(), file, indent=3)

In [13]:
# Preview the first rows of the combined dataset.
new_df.head()

Unnamed: 0,DOI,Title,Abstract
0,10.1016/j.tcs.2005.09.049,admissibility opponent model search,opponent model search come two type risk first...
1,10.2307/30039042,psychometric analysis chess expertise psychome...,study introduces amsterdam chess test act act ...
2,10.1007/11922155_1,,best chess program reached top level player hu...
3,10.24963/ijcai.2024/480,monte carlo continual resolving online strateg...,online game playing algorithm produce high qua...
4,10.1007/bf02454222,role memory concept learning,extent concept memory planning necessary simul...


In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# WordNet-based lemmatizer applied to every token kept by preprocess_list.
lemmatizer = WordNetLemmatizer()
# Tokenizer whose pattern matches runs of ASCII letters only.
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

def preprocess_list(list_texts: List[str]) -> List[str]:
    """Normalise raw texts into space-joined, lemmatized tokens.

    Each text is lower-cased, tokenized with the module-level ``tokenizer``,
    stripped of tokens of two characters or fewer and of stop words, and then
    lemmatized token by token.

    The result always has exactly one entry per input item, in the same order,
    so it stays aligned with the rows the texts came from. The previous blanket
    ``except`` replaced the whole batch with ``[" "]`` as soon as one item
    failed, silently breaking that alignment.

    Args:
        list_texts: Texts to preprocess. ``None`` is treated as an empty list.

    Returns:
        One preprocessed string per input item. Items that are not strings
        (e.g. ``None`` or ``NaN`` from missing values) yield ``""``, the same
        value a text with no surviving tokens produces.
    """
    if list_texts is None:
        return []

    processed: List[str] = []
    for item in list_texts:
        # Missing values cannot be tokenized: emit an empty document instead of
        # aborting the whole batch.
        if not isinstance(item, str):
            processed.append("")
            continue

        tokens = tokenizer.tokenize(item.lower())
        kept = [
            token
            for token in tokens
            if len(token) > 2 and token not in set_stopwords
        ]
        processed.append(" ".join(lemmatizer.lemmatize(token) for token in kept))

    return processed

In [15]:
from sentence_transformers import SentenceTransformer

# Preprocess the abstracts; these strings are the documents given to the topic model.
docs = preprocess_list(list(new_df.Abstract))

# Embed the documents once; the resulting embeddings are passed to fit_transform below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from bertopic import BERTopic

model = BERTopic(
    # Hand BERTopic the same model that produced `embeddings`: it needs an
    # embedding model to embed the zero-shot labels, and without one the
    # zero-shot step is presumably skipped - confirm against the BERTopic docs.
    # Reusing the instance also keeps labels and documents in one vector space.
    embedding_model=embedding_model,
    zeroshot_topic_list=["Computer Chess", "Quantum"],
    zeroshot_min_similarity=0.6,
)

# NOTE(review): no random seed is fixed, so topic assignments may vary between runs.
_topics, _ = model.fit_transform(docs, embeddings)

In [17]:
# Check whether the papers are actually split by category.

model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,40,0_chess_game_search_move,"[chess, game, search, move, paper, test, best,...",[opening book important component chess engine...
1,1,40,1_quantum_system_computing_information,"[quantum, system, computing, information, comp...",[quantum computation quantum information great...
