__Obiettivo__

Classificare i contenuti estrapolati dai vari paper secondo una lista predefinita di topics.

In [154]:
import os 
import nltk

from dotenv import load_dotenv
from nltk.corpus import stopwords
from huggingface_hub import login

# Load the variables declared in a local .env file into the process environment.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token taken from the environment.
login(token=os.environ.get("HUGGING_FACE_KEY"))

# Fetch the NLTK resources needed later for lemmatization and stopword removal.
for _nltk_resource in ("wordnet", "stopwords"):
    nltk.download(_nltk_resource)

# English stopwords as a set, for fast membership tests.
set_stopwords = {word for word in stopwords.words("english")}

[nltk_data] Downloading package wordnet to /home/matte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/matte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [135]:
from typing import List

class Paper():
    """Plain container for the fields extracted from one paper.

    Attributes:
        DOI: identifier of the paper.
        title: title of the paper.
        author: list of author names.
        keyword: list of keywords.
        abstract: abstract text.
        introduction: introduction text.

    The constructor stores the values as given, without validation or copying.
    """

    # Both list parameters use typing.List so the annotations are consistent and
    # the class can also be defined on interpreters older than Python 3.9.
    def __init__(self, DOI: str, title: str, author: List[str], keyword: List[str], abstract: str, introduction: str):
        self.DOI = DOI
        self.title = title
        self.author = author
        self.keyword = keyword
        self.abstract = abstract
        self.introduction = introduction

    def __str__(self):
        """Return the DOI and the title separated by a single space."""
        return f"{self.DOI} {self.title}"

In [136]:
import json

from typing import List

# Read the extracted paper data. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale (JSON text is UTF-8).
with open("../../json/grobid/data.json", "r", encoding="utf-8") as file:
    _json = json.load(file)

# The loaded JSON is iterated as a list of dicts; every value of each dict is a
# record holding the fields of one paper.
list_paper: List[Paper] = []
for _dict in _json:
    for value in _dict.values():
        paper = Paper(value["DOI"], value["Title"], value["Author"], value["Keyword"], value["Abstract"], value["Introduction"])

        list_paper.append(paper)

In [165]:
import pandas

# Raw (unprocessed) fields: one DataFrame column per Paper attribute.
data = {
    column: [getattr(paper, attribute) for paper in list_paper]
    for column, attribute in (
        ("DOI", "DOI"),
        ("Title", "title"),
        ("Keywords", "keyword"),
        ("Abstract", "abstract"),
        ("Introduction", "introduction"),
    )
}

# Keep only the papers that have a DOI, a title and an abstract.
df_cleaned = pandas.DataFrame(data).dropna(subset=["DOI", "Title", "Abstract"])
df_cleaned.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction
2,10.1007/3-540-44595-1_3,An Intelligent Tutor for a Web-Based Chess Course,"[textbooks, chess, teachers, tutor, tutoring]",Web-based intelligent tutoring systems try to ...,Courses are a common way to organize teaching ...
5,10.3233/icg-1998-21203,Learning To Play Chess Using Temporal Differences,"[td, knightcap, algorithm, internet, chess]","In this paper we present TDLEAF( ), a variatio...","Temporal Difference learning, first introduced..."
7,10.1016/s0020-0255(99)00093-6,Temporal dierence learning for heuristic searc...,"[backgammon, championship, algorithmic, td, ch...",Temporal dierence (TD) learning is a natural m...,"Central to all heuristic search, whether singl..."
8,10.1006/ijhc.1999.0338,Incremental acquisition of search knowledge,"[hampered, intractable, expert, intelligence, ...","Expert systems by de""nition address a particul...","In its early days, arti""cial intelligence (AI)..."
9,10.1016/s0020-0255(99)00097-3,Risk management in game-tree pruning,"[century, strategy, search, algorithm, algorit...",In the half century since minimax was ®rst sug...,The standard approach to game-tree search is t...


In [166]:
from langdetect import detect
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from deep_translator import GoogleTranslator

# Shared WordNet lemmatizer instance, used by preprocess_field.
lemmatizer = WordNetLemmatizer()
# Shared tokenizer whose tokens are the runs of ASCII letters in the text;
# everything else (digits, punctuation, whitespace) is not part of any token.
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

def preprocess_field(field: str) -> List[str]:
    """Turn a free-text field into a list of lemmatized English tokens.

    Steps: translate the text to English when the detected language is not
    "en", lowercase it, split it into alphabetic tokens, drop tokens of at most
    two characters and English stopwords, then lemmatize what is left.

    Args:
        field: the raw text. Missing (non-string) or blank values are accepted.

    Returns:
        The lemmatized tokens, or the placeholder ``[" "]`` when the field is
        missing/blank or when any processing step fails.
    """
    import logging  # local import so the function does not depend on another cell

    # Missing or blank input cannot be language-detected; return the placeholder
    # directly instead of relying on an exception to get there.
    if not isinstance(field, str) or not field.strip():
        return [" "]

    # Short excerpt for the warning below, taken before `field` can be rebound.
    preview = field[:60]

    try:
        if detect(field) != "en":
            field = GoogleTranslator(source="auto", target="en").translate(field)

        tokens = tokenizer.tokenize(field.lower())

        return [
            lemmatizer.lemmatize(token)
            for token in tokens
            if len(token) > 2 and token not in set_stopwords
        ]
    except Exception as error:
        # Best effort: keep the placeholder fallback, but report the failure so that
        # dropped texts (e.g. detection or translation errors) do not go unnoticed.
        logging.getLogger(__name__).warning(
            "preprocess_field failed for text starting with %r: %s", preview, error
        )
        return [" "]

In [167]:
# Missing titles and abstracts are kept as None rather than being passed to
# preprocess_field: that function returns the placeholder [" "] for input it cannot
# process, so the dropna() below could never remove a row because of those columns.
preprocessed_data = {
    "DOI": [paper.DOI for paper in list_paper],
    "Title": [None if paper.title is None else preprocess_field(paper.title) for paper in list_paper],
    "Keywords": [paper.keyword for paper in list_paper],
    "Abstract": [None if paper.abstract is None else " ".join(preprocess_field(paper.abstract)) for paper in list_paper],
    "Introduction": [" ".join(preprocess_field(paper.introduction)) for paper in list_paper]
}

# Keep only the papers that have a DOI, a title and an abstract.
df_cleaned_preprocessed = pandas.DataFrame(data=preprocessed_data).dropna(subset=["DOI", "Title", "Abstract"])
df_cleaned_preprocessed.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction
2,10.1007/3-540-44595-1_3,"[intelligent, tutor, web, based, chess, course]","[textbooks, chess, teachers, tutor, tutoring]",web based intelligent tutoring system try fill...,course common way organize teaching high level...
5,10.3233/icg-1998-21203,"[learning, play, chess, using, temporal, diffe...","[td, knightcap, algorithm, internet, chess]",paper present tdleaf variation algorithm enabl...,temporal difference learning first introduced ...
7,10.1016/s0020-0255(99)00093-6,"[temporal, dierence, learning, heuristic, sear...","[backgammon, championship, algorithmic, td, ch...",temporal dierence learning natural method rein...,central heuristic search whether single agent ...
8,10.1006/ijhc.1999.0338,"[incremental, acquisition, search, knowledge]","[hampered, intractable, expert, intelligence, ...",expert system nition address particular domain...,early day arti cial intelligence strong emphas...
9,10.1016/s0020-0255(99)00097-3,"[risk, management, game, tree, pruning]","[century, strategy, search, algorithm, algorit...",half century since minimax rst suggested strat...,standard approach game tree search use improve...


In [168]:
import re

from typing import Dict

def extract_topics(path: str) -> Dict[str, List[str]]:
    """Parse a numbered topic outline from a markdown file.

    Top-level labels are matched as ``<number>. <label>`` and their sub-topics as
    ``<number>.<sub-number> <sub-topic>``; optional ``*`` emphasis markers around
    the text are stripped. Each captured text runs up to the last word character
    on its line.

    Args:
        path: path of the markdown file to read.

    Returns:
        A dict mapping each top-level label to the list of its sub-topics, in
        file order. A label without sub-topics maps to an empty list.
    """
    with open(path, "r", encoding="utf-8") as _markdown:
        content = _markdown.read()

    # (number, label) pairs. The dot is escaped so it only matches a literal ".",
    # and the lookbehind stops the pattern from starting inside a longer number or
    # inside a sub-topic prefix such as "1.1".
    numbered_labels = re.findall(r"(?<![0-9.])([0-9]+)\.\s\**(.*\w)\**", content)

    _dict = {}
    for number, label in numbered_labels:
        # Sub-topics are looked up by the label's own number (not its position) and
        # may use multi-digit sub-numbers; the lookbehind keeps label 1 from also
        # collecting the sub-topics of 11, 21, ...
        _dict[label] = re.findall(fr"(?<![0-9.]){number}\.[0-9]+ \**(.*\w)\**", content)

    return _dict

# Mapping from each top-level label to the list of its sub-topics, parsed from the markdown outline.
topics = extract_topics("../../markdown/topics.md")

In [169]:
from sentence_transformers import SentenceTransformer

# Candidate topics for the zero-shot classification: the sub-topic lists of every
# label, flattened into a single list.
list_zeroshot_topics = [
    sublabel
    for sublabels in topics.values()
    for sublabel in sublabels
]

# Embed the abstracts once up front so the embeddings can be handed to the topic model.
embedding_model = SentenceTransformer("all-mpnet-base-v2")
docs = df_cleaned["Abstract"].tolist()

embeddings = embedding_model.encode(docs)

In [170]:
from bertopic import BERTopic

# Topic model configured with the predefined candidate topics and a
# zero-shot minimum similarity of 0.6.
model = BERTopic(
    zeroshot_topic_list=list_zeroshot_topics,
    zeroshot_min_similarity=.6
)

# Fit on the abstracts, reusing the precomputed embeddings.
# NOTE(review): this rebinds `topics`, which until here held the dict returned by
# extract_topics; re-running the cell that calls `topics.values()` after this one
# will fail unless the extraction cell is re-run first.
topics, _ = model.fit_transform(docs, embeddings=embeddings)



In [171]:
# Display the fitted model's topic summary table.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,293,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, chess, for]",[The point of game tree search is to insulate ...
1,0,53,0_positions_memory_the_of,"[positions, memory, the, of, players, in, that...",[This paper addresses empirically and theoreti...
2,1,48,1_the_to_of_in,"[the, to, of, in, knowledge, is, and, this, an...",[By developing an intelligent computer system ...
3,2,47,2_and_the_to_of,"[and, the, to, of, with, system, is, user, int...",[Although large displays could allow several u...
4,3,38,3_chess_skill_practice_in,"[chess, skill, practice, in, of, and, study, i...",[Although it is widely acknowledged that chess...
5,4,36,4_learning_to_of_reinforcement,"[learning, to, of, reinforcement, games, that,...",[In this paper we present some experiments in ...
6,5,35,5_search_algorithm_trees_alphabeta,"[search, algorithm, trees, alphabeta, tree, th...",[Many enhancements to the alpha-beta algorithm...
7,6,34,6_the_evolutionary_program_genetic,"[the, evolutionary, program, genetic, of, func...","[Here, we propose an evolutionary algorithm (i..."
8,7,30,7_parallel_the_to_on,"[parallel, the, to, on, of, hardware, and, che...",[This paper proposes a distributed and scalabl...
9,8,29,8_kriegspiel_player_games_of,"[kriegspiel, player, games, of, the, we, game,...",[Kriegspiel is a chess variant similar to war ...
