__Obiettivo__

Classificare i contenuti estrapolati dai vari paper secondo una lista predefinita di topics.

In [12]:
import nltk

from dotenv import load_dotenv
from nltk.corpus import stopwords

load_dotenv()

nltk.download("wordnet")
nltk.download("stopwords")
set_stopwords = set(stopwords.words("english"))

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [3]:
from typing import List

class Paper():
    def __init__(self, DOI: str, title: str, author: list[str], keyword: List[str], abstract: str, introduction: str):
        self.DOI = DOI
        self.title = title
        self.author = author
        self.keyword = keyword
        self.abstract = abstract
        self.introduction = introduction

    def __str__(self):
        return str(self.DOI) + " " + str(self.title)

In [4]:
import json

from typing import List

with open("../../json/extraction/metadata_completed.json", "r") as file:
    _json = json.load(file)

list_paper: List[Paper] = []
for _dict in _json:
    for value in _dict.values():
        paper = Paper(value["DOI"], value["Title"], value["Author"], value["Keyword"], value["Abstract"],value["Introduction"])

        list_paper.append(paper)

In [5]:
import pandas

data = {
    "DOI": [paper.DOI for paper in list_paper],
    "Title": [paper.title for paper in list_paper],
    "Keywords": [paper.keyword for paper in list_paper],
    "Abstract": [paper.abstract for paper in list_paper],
    "Introduction": [paper.introduction for paper in list_paper]
}

df_not_cleaned = pandas.DataFrame(data=data).dropna(subset=["DOI", "Title", "Abstract"])
df_not_cleaned.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction
2,10.1007/3-540-44595-1_3,An Intelligent Tutor for a Web-Based Chess Course,"[textbooks, chess, teachers, tutor, tutoring]",Web-based intelligent tutoring systems try to ...,Courses are a common way to organize teaching ...
5,10.3233/icg-1998-21203,Learning To Play Chess Using Temporal Differences,"[td, knightcap, algorithm, internet, chess]","In this paper we present TDLEAF( ), a variatio...","Temporal Difference learning, first introduced..."
7,10.1016/s0020-0255(99)00093-6,Temporal dierence learning for heuristic searc...,"[backgammon, championship, algorithmic, td, ch...",Temporal dierence (TD) learning is a natural m...,"Central to all heuristic search, whether singl..."
8,10.1006/ijhc.1999.0338,Incremental acquisition of search knowledge,"[hampered, intractable, expert, intelligence, ...","Expert systems by de""nition address a particul...","In its early days, arti""cial intelligence (AI)..."
9,10.1016/s0020-0255(99)00097-3,Risk management in game-tree pruning,"[century, strategy, search, algorithm, algorit...",In the half century since minimax was ®rst sug...,The standard approach to game-tree search is t...


In [6]:
from langdetect import detect
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from deep_translator import GoogleTranslator

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

def preprocess_field(field: str) -> List[str]:
    try:
        if detect(field) != "en":
            field = GoogleTranslator(source="auto", target="en").translate(field)

        text = field.lower()
        tokens = tokenizer.tokenize(text)

        tokens = [token for token in tokens if len(token) > 2]
        tokens = [token for token in tokens if token not in set_stopwords]

        return [lemmatizer.lemmatize(token) for token in tokens]
    except Exception:
        return [" "]

In [7]:
preprocessed_data = {
    "DOI": [paper.DOI for paper in list_paper],
    "Title": [" ".join(preprocess_field(paper.title)) for paper in list_paper],
    "Keywords": [paper.keyword for paper in list_paper],
    "Abstract": [" ".join(preprocess_field(paper.abstract)) for paper in list_paper],
    "Introduction": [" ".join(preprocess_field(paper.introduction)) for paper in list_paper]
}

df_cleaned_preprocessed = pandas.DataFrame(data=preprocessed_data).dropna(subset=["DOI", "Title", "Abstract"])
df_cleaned_preprocessed.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction
2,10.1007/3-540-44595-1_3,intelligent tutor web based chess course,"[textbooks, chess, teachers, tutor, tutoring]",web based intelligent tutoring system try fill...,course common way organize teaching high level...
5,10.3233/icg-1998-21203,learning play chess using temporal difference,"[td, knightcap, algorithm, internet, chess]",paper present tdleaf variation algorithm enabl...,temporal difference learning first introduced ...
7,10.1016/s0020-0255(99)00093-6,temporal dierence learning heuristic search ga...,"[backgammon, championship, algorithmic, td, ch...",temporal dierence learning natural method rein...,central heuristic search whether single agent ...
8,10.1006/ijhc.1999.0338,incremental acquisition search knowledge,"[hampered, intractable, expert, intelligence, ...",expert system nition address particular domain...,early day arti cial intelligence strong emphas...
9,10.1016/s0020-0255(99)00097-3,,"[century, strategy, search, algorithm, algorit...",half century since minimax rst suggested strat...,standard approach game tree search use improve...


In [8]:
df_cleaned_preprocessed["Text"] = df_cleaned_preprocessed["Abstract"] + " " + df_cleaned_preprocessed["Introduction"]
df_cleaned_preprocessed.head()

Unnamed: 0,DOI,Title,Keywords,Abstract,Introduction,Text
2,10.1007/3-540-44595-1_3,intelligent tutor web based chess course,"[textbooks, chess, teachers, tutor, tutoring]",web based intelligent tutoring system try fill...,course common way organize teaching high level...,web based intelligent tutoring system try fill...
5,10.3233/icg-1998-21203,learning play chess using temporal difference,"[td, knightcap, algorithm, internet, chess]",paper present tdleaf variation algorithm enabl...,temporal difference learning first introduced ...,paper present tdleaf variation algorithm enabl...
7,10.1016/s0020-0255(99)00093-6,temporal dierence learning heuristic search ga...,"[backgammon, championship, algorithmic, td, ch...",temporal dierence learning natural method rein...,central heuristic search whether single agent ...,temporal dierence learning natural method rein...
8,10.1006/ijhc.1999.0338,incremental acquisition search knowledge,"[hampered, intractable, expert, intelligence, ...",expert system nition address particular domain...,early day arti cial intelligence strong emphas...,expert system nition address particular domain...
9,10.1016/s0020-0255(99)00097-3,,"[century, strategy, search, algorithm, algorit...",half century since minimax rst suggested strat...,standard approach game tree search use improve...,half century since minimax rst suggested strat...


In [None]:
import re

from typing import Dict

def extract_topics(path: str) -> Dict[str, str]:
    _dict = {}

    with open(path, "r") as _markdown:
        content = _markdown.read()

    labels = re.findall(r"[0-9].\s\**(.*\w)\**", content)

    for i in range(1, len(labels) + 1):
        sublabels = re.findall(fr"{i}.[0-9] \**(.*\w)\**", content)

        _dict[labels[i - 1]] = sublabels

    return _dict

topics = extract_topics("../../markdown/topics.md")

In [10]:
from sentence_transformers import SentenceTransformer

# List of predefined topics to use during the classification
list_zeroshot_topics = sum([item for item in topics.values()], [])

# Precalculating the embeddings
docs = list(df_cleaned_preprocessed.Abstract)
embedding_model = SentenceTransformer("all-mpnet-base-v2")

embeddings = embedding_model.encode(docs)

In [16]:
from bertopic import BERTopic

model = BERTopic(
    zeroshot_topic_list=list_zeroshot_topics,
    zeroshot_min_similarity=.6
)

_topics, _ = model.fit_transform(docs, embeddings=embeddings)

In [17]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,228,-1_the_of_to_and,"[the, of, to, and, in, that, is, for, chess, we]",[In this paper a checkmate control procedure f...
1,0,69,0_search_the_minimax_of,"[search, the, minimax, of, is, tree, algorithm...",[In game-playing programs relying on the minim...
2,1,66,1_learning_to_of_games,"[learning, to, of, games, the, and, in, we, ga...",[In this paper we present some experiments in ...
3,2,53,2_the_of_and_to,"[the, of, and, to, is, chess, players, we, in,...",[Who is the best chess player of all time? Che...
4,3,47,3_and_the_to_of,"[and, the, to, of, with, system, is, user, for...",[Although large displays could allow several u...
5,4,42,4_chess_skill_practice_in,"[chess, skill, practice, in, and, of, performa...",[Although it is widely acknowledged that chess...
6,5,36,5_to_the_of_in,"[to, the, of, in, knowledge, chess, this, is, ...",[By developing an intelligent computer system ...
7,6,33,6_the_evolutionary_program_genetic,"[the, evolutionary, program, genetic, of, func...","[Here, we propose an evolutionary algorithm (i..."
8,7,33,7_parallel_the_to_on,"[parallel, the, to, on, of, hardware, and, fpg...",[In this paper we will describe some of the ba...
9,8,31,8_women_men_in_differences,"[women, men, in, differences, chess, players, ...",[Only 1% of the world's chess grandmasters are...


In [11]:
with open("../../json/topics/topics.json", "w") as file:
    json.dump(topics, file, indent=3)

with open("../../json/dataset/dataset_not_cleaned.json", "w") as file:
    json.dump(df_not_cleaned.to_dict(), file, indent=3)

with open("../../json/dataset/dataset_preprocessed.json", "w") as file:
    json.dump(df_cleaned_preprocessed.to_dict(), file, indent=3)