# Library Imports

We import all the necessary Python libraries used throughout the analysis.


In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

from transformers import AutoTokenizer


# Dataset Loading


In [2]:
# Read the cleaned corpus (a UTF-8 CSV) into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer="../Datasets/cleaned_dataset.csv", encoding='utf-8')

# Technical terms

In [3]:
# Fetch the NLTK stopword corpus so that stopwords.words() can read it.
nltk.download('stopwords')

# Extra filler words to drop in addition to NLTK's English list.
custom_stopwords = {"et", "al", "using", "based", "method", "approach"}

# Final stopword set: NLTK English stopwords plus the custom additions.
stop_words = set(stopwords.words('english')) | custom_stopwords

# Pre-processing function
def preprocess_text(text, stopword_set=None):
    """Normalize raw text into a space-joined string of non-stopword tokens.

    Processing steps, in order: lowercase, delete runs of digits, delete every
    character that is neither a word character nor whitespace, split on
    whitespace, and discard tokens found in the stopword set.

    Parameters
    ----------
    text : str or None
        Raw text to clean. ``None`` and float NaN are treated as empty text
        and yield ``''``; any other non-string value is converted with
        ``str()`` before processing.
    stopword_set : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``''`` if none remain).
    """
    # NaN is the only float that differs from itself; treat it like None.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    # Punctuation is deleted rather than replaced by a space, so
    # "bert-based" becomes the single token "bertbased".
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(tok for tok in text.split() if tok not in stopword_set)

# Apply preprocessing. Missing abstracts are replaced with an empty string
# first: a bare .astype(str) would turn NaN into the literal text "nan",
# which TF-IDF would then count as if it were a real word.
df['abstract_clean_processed'] = (
    df['abstract_clean']
    .fillna('')
    .astype(str)
    .apply(preprocess_text)
)

# TF-IDF: fitting builds the vocabulary; max_features=5000 caps its size.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['abstract_clean_processed'])
technical_terms = vectorizer.get_feature_names_out()

# Convert technical terms to a single-column DataFrame
df_terms = pd.DataFrame(technical_terms, columns=['technical_term'])

# Save to CSV
df_terms.to_csv('../Datasets/technical_terms.csv', index=False, header=True)

print("Technical terms saved to '../Datasets/technical_terms.csv'")

[nltk_data] Downloading package stopwords to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Tokenization of the terms

In [4]:

def tokenize_terms(terms, model_name):
    """Tokenize each term with a pretrained Hugging Face tokenizer.

    Parameters
    ----------
    terms : iterable of str
        The terms to tokenize, one tokenizer call per term.
    model_name : str
        Model identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns
    -------
    pandas.DataFrame
        One row per term with the columns ``term``, ``tokens`` (token
        strings) and ``token_ids`` (vocabulary ids, without special tokens).
        The three columns are present even when ``terms`` is empty.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    records = []
    for term in terms:
        # Encode once (no [CLS]/[SEP]-style special tokens) and derive the
        # token strings from the ids, instead of tokenizing the term twice.
        token_ids = tokenizer(term, add_special_tokens=False)['input_ids']
        records.append({
            "term": term,
            "tokens": tokenizer.convert_ids_to_tokens(token_ids),
            "token_ids": token_ids,
        })

    # Explicit columns keep the output schema stable for an empty input.
    return pd.DataFrame(records, columns=["term", "tokens", "token_ids"])

# BERT base: tokenize the extracted terms and write the result to CSV
df_bert = tokenize_terms(terms=technical_terms, model_name="bert-base-uncased")
df_bert.to_csv(path_or_buf="../Datasets/technical_terms_tokenized_BERT.csv", index=False)
print("Tokenization BERT base stored.")

# SciBERT: same procedure with the scientific-vocabulary tokenizer
df_scibert = tokenize_terms(terms=technical_terms, model_name="allenai/scibert_scivocab_uncased")
df_scibert.to_csv(path_or_buf="../Datasets/technical_terms_tokenized_SciBERT.csv", index=False)
print("Tokenization SciBERT stored.")

Tokenization BERT base stored.
Tokenization SciBERT stored.
