# Importing

In [None]:
import re
import nltk
import spacy
from jupyter_core.version import pattern
from nltk.corpus import stopwords
#from sqlalchemy.testing import not_in
#from textblob import TextBlob
from collections import Counter
from spacy.lang.en.stop_words import contractions
import random
import pandas as pd
import language_tool_python
import pprint

# Load obtained dataset

In [None]:
# Relative location of the CSV holding the imported data.
filename_all_data_dict = "./Files/data_imported_by_pdf_coordinates.csv"

# The file has no header row, so the two column names are supplied here.
data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=['file', 'text'],
)
data_df

In [None]:
# Pull both DataFrame columns out as plain Python lists, in row order, so
# that file_list[k] and text_list[k] come from the same CSV row.
file_list, text_list = (
    data_df[column].tolist() for column in ('file', 'text')
)

# Data preprocessing steps


In [None]:
# Italian stop-word set, built from the NLTK 'stopwords' corpus
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
italian_stopwords = stopwords.words('italian')
stop_words = set(italian_stopwords)

# spaCy pipeline for Italian (small news model)
nlp = spacy.load('it_core_news_sm')

### Explore files with different format

In [None]:
def remove_index(text):
    """Strip the table of contents ("Indice") from a document.

    The text is scanned line by line. Any line containing the word
    "Indice" (case-insensitive) switches the scanner into index mode and is
    itself dropped. While in index mode, lines that end with a number, lines
    made only of digits, and blank lines are dropped; the first line that
    matches none of these ends index mode and is kept.

    Args:
    - text (str): The document text, with lines separated by "\n".

    Returns:
    - str: The text without the index lines, re-joined with "\n".
    """
    kept = []
    inside_index = False

    for current in text.split("\n"):
        # A heading line with the word "Indice" opens the index section
        if re.search(r'\bIndice\b', current, re.IGNORECASE):
            inside_index = True
            continue

        if inside_index:
            # Index entry: anything that ends with a (page) number
            looks_like_entry = (
                re.match(r'^.*\s*\.*\s*\d+\s*$', current.strip()) is not None
            )
            # Bare page number or empty line between entries
            number_or_blank = (
                re.match(r'^\d+$', current) is not None
                or re.match(r'^[\s]*$', current) is not None
            )
            if looks_like_entry or number_or_blank:
                continue
            # First ordinary line: the index section is over
            inside_index = False

        kept.append(current)

    return "\n".join(kept)


In [None]:
def index_dots_removal(text):
    """Drop every line that contains a dot leader (10 or more dots in a row).

    Runs of dots like these are what table-of-contents entries look like
    ("Title ............ 12"). All other lines are returned unchanged,
    including their original line terminators.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text without the lines containing a dot leader.
    """
    dot_leader = re.compile(r"\.{10,}")

    # keepends=True leaves each line's own terminator attached, so joining
    # with the empty string restores the text exactly (minus dropped lines).
    # The previous code passed "\n" as `keepends` and then joined with "\n",
    # which doubled every newline in the document.
    kept_lines = [
        line
        for line in text.splitlines(keepends=True)
        if not dot_leader.search(line)
    ]
    return "".join(kept_lines)

In [None]:
def remove_remaining_indexes(text):
    """Replace a leftover index section with a single space.

    The removed span starts at "Obiettivi del manuale" and ends with a
    "1. " heading that follows three newlines; the match is greedy and may
    span several lines.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with the matched span replaced by one space.
    """
    leftover_index = re.compile(
        r"(Obiettivi del manuale).*(\n\n\n1\. )", flags=re.DOTALL
    )
    return leftover_index.sub(' ', text)

In [None]:
def remove_table_content(text):
    """
    Delete hand-written tables: spans that start with 'Es.', contain at
    least two '|' characters, and run up to the next blank line (or the end
    of the text). Runs of blank lines left behind are then collapsed.

    Args:
    - text (str): The input text from which table content needs to be removed.

    Returns:
    - str: The modified text with table-like content removed.
    """
    # DOTALL lets the span cross line breaks inside the table
    table_block = re.compile(
        r'Es\..*?(?:\|.*?){2,}.*?(?=\n\n|\Z)', flags=re.DOTALL
    )
    without_tables = table_block.sub('', text)

    # Two or more newlines (possibly with whitespace between) become exactly two
    return re.sub(r'\n\s*\n+', '\n\n', without_tables)

In [None]:
def remove_remaining_table_from_text(text):
    """Replace a leftover "tabella" block with a single newline.

    A block starts with a blank line followed by the word "tabella", runs
    to a line containing two '|' characters, and ends at the next blank line.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with each such block replaced by one newline.
    """
    table_block = re.compile(
        r"\n\ntabella[\s\S]*?\|.*\|\s*\n\n", flags=re.MULTILINE
    )
    return table_block.sub('\n', text)

In [None]:
def remove_table_header(text):
    """Delete table rule lines: runs of 2+ '-' or of 3+ '='.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with those runs removed.
    """
    rule_run = re.compile(r"--+|===+", flags=re.MULTILINE)
    return rule_run.sub('', text)

In [None]:
def flatten_any_list(text):
    """Flatten list-like blocks into comma-separated running text.

    A block starts at a line that begins with a list marker and extends over
    the lines that follow it. Inside a block every marker is replaced by
    ", " and the remaining line breaks by a single space.

    Recognised markers (each at the start of a line, after optional spaces):
    1. Bullets: *, -, •, –, ▪
    2. Numbers followed by '.' or ')' (e.g. 1. 2) )
    3. Mixed number/letter items followed by '.' or ')' (e.g. 1a) a. )
    4. A single ASCII letter (e.g. I, U, D)
    5. A bare number
    6. Nested numeration (e.g. 1.1.1. 2.30)

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with list blocks flattened.
    """
    # One marker expression shared by block detection and marker stripping.
    # Previously the two patterns were maintained separately and had drifted:
    # '▪' was detected as a bullet but never stripped from the flattened text.
    # NOTE(review): in a raw string, `\\uf0a7` matches the literal characters
    # backslash-u-f-0-a-7, not the U+F0A7 glyph; kept as it was — confirm
    # which form actually occurs in the data.
    marker = (
        r"(?:\n\s*[\*\-\•\–\▪]"
        r"|\n\s*\d+[\.\)]"
        r"|\n\s*\d*\w[\.\)]"
        r"|\n\s*[a-zA-Z]{1}"
        r"|\n\s*\d+"
        r"|\n\s*(?:\d+[\.\)]?)+"
        r"|\\uf0a7)"
    )
    list_block = re.compile(marker + r"\s+.+(?:\s+.+)*")
    list_marker = re.compile(marker + r"\s+")

    def replace_list_with_commas(match):
        # Turn every marker into ", " ...
        flattened = list_marker.sub(", ", match.group(0))
        # ... join continuation lines of an item with a single space ...
        flattened = re.sub(r"\n\s*", " ", flattened)
        # ... and tidy up doubled spaces.
        return flattened.replace("  ", " ").strip()

    # Only list blocks are rewritten; all other text is left untouched
    normalized_text = list_block.sub(replace_list_with_commas, text)

    # A list introduced by "…: " would otherwise read "…: , first item"
    return re.sub(r": ,", ": ", normalized_text)

In [None]:
def normalize_whitespace(text):
    """
    Collapse every run of whitespace (spaces, tabs, newlines, ...) into one
    space and trim the ends.

    Args:
    - text (str): The input text with excessive whitespace.

    Returns:
    - str: The text with reduced whitespace.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()


In [None]:
def remove_issue_date(text):
    """Delete every "Data emissione <n>/<n>/<n>" stamp from the text.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text without the issue-date stamps.
    """
    issue_date = re.compile(r'Data emissione (\d+\/){2}(\d+)')
    return issue_date.sub('', text)

def remove_issue_slash(text):
    """Replace every '/' with a single space.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with slashes turned into spaces.
    """
    slash = re.compile(r'\/')
    return slash.sub(' ', text)

In [None]:
# Create some functions to also remove unwanted tables and equations with special characters
# equations = r"(\d).*[=+*\/∑√]{2,}( \d*|\d*)|\d+\/\d+|\^"
# special_elements = r"\uf0e6|\uf0f6|\uf0e7|\uf0f7|\uf0e8|\uf0f8|\uf0e5"
# table_elem = r"([A-Z]{1,3}[\ ,]){3,}|([0-9]{1,3}[\ ,]){3,}"

In [None]:
def clean_text_template(text):
    """Run the full template-cleaning pipeline on one document.

    The steps are applied in this fixed order: index removal, dot-leader
    lines, leftover index, tables, leftover tables, table rules, list
    flattening, issue dates, slashes, and finally whitespace normalisation.

    Args:
    - text (str): The raw document text.

    Returns:
    - str: The cleaned text.
    """
    cleaning_steps = (
        remove_index,
        index_dots_removal,
        remove_remaining_indexes,
        remove_table_content,
        remove_remaining_table_from_text,
        remove_table_header,
        flatten_any_list,
        remove_issue_date,
        remove_issue_slash,
        normalize_whitespace,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

Use language_tool_python to correct the text grammatically and syntactically: misspellings and other errors reported by the tool are fixed, but only for the rules that our analysis confirmed to be genuine errors.

In [None]:
import language_tool_python

# One LanguageTool checker per language: Italian first, then US English
tool_it, tool_en = (
    language_tool_python.LanguageTool(language_code)
    for language_code in ('it-IT', 'en-US')
)

def is_valid_word(word):
    """Tell whether `word` is accepted by the Italian or the English checker.

    Both checkers are always queried; the word counts as valid as soon as
    one of them reports no issue.

    Args:
    - word (str): The text fragment to check.

    Returns:
    - bool: True if at least one checker returns no matches.
    """
    italian_issues = tool_it.check(word)
    english_issues = tool_en.check(word)
    # Invalid only when BOTH checkers complain
    return not (italian_issues and english_issues)

def correct_errors(text, matches):
    """Apply the unambiguous LanguageTool suggestions to `text`.

    A match is applied only when all of the following hold:
    - its rule is one of the whitelisted rule ids below;
    - it offers exactly one replacement;
    - the flagged fragment is not accepted by `is_valid_word`;
    - for MORFOLOGIK_RULE_IT_IT, the replacement does not start with an
      uppercase letter.

    Args:
    - text (str): The text that was checked.
    - matches: The matches returned by the checker for `text`; each one
      exposes `ruleId`, `replacements`, `offset` and `errorLength`.

    Returns:
    - str: The corrected text.
    """
    # Rules whose single suggestion is trusted
    valid_rules = {'MORFOLOGIK_RULE_IT_IT', 'WHITESPACE_RULE', 'GR_04_002',
                   'ITALIAN_WORD_REPEATED_RULE', 'ARTICOLATA_SOSTANTIVO', 'UNPAIRED_BRACKETS'}

    # Offsets refer to the text as it was when it was checked. Working from
    # the end of the text backwards keeps the offsets of the matches still to
    # be processed valid even when a replacement changes the text length.
    for match in sorted(matches, key=lambda m: m.offset, reverse=True):
        if match.ruleId not in valid_rules or len(match.replacements) != 1:
            continue

        replacement = match.replacements[0]
        start = match.offset
        end = start + match.errorLength

        # The fragment flagged as an error
        incorrect_word = text[start:end]

        # Leave fragments that are valid Italian or English untouched
        if is_valid_word(incorrect_word):
            continue

        # Spelling suggestions that start with a capital letter are skipped;
        # slicing ([:1]) keeps this safe for an empty replacement string.
        if match.ruleId == 'MORFOLOGIK_RULE_IT_IT' and replacement[:1].isupper():
            continue

        text = text[:start] + replacement + text[end:]

    return text


In [None]:
# Clean every raw document with the template pipeline, then apply the
# LanguageTool-based corrections; results are collected in input order.
cleaned_text_list = []

for raw_text in text_list:
    text = clean_text_template(raw_text)
    matches = tool_it.check(text)
    cleaned_text_list.append(correct_errors(text, matches))

In [None]:
# Pretty-print the first cleaned document as a quick visual check
pprint.pprint(cleaned_text_list[0])

Remove some additional patterns to normalize the text, and discard elements that were not identified correctly, such as equations, special characters inside equations, and stray brackets.

In [None]:
def lowercase_and_remove_punctuation(text):
    """Lowercase `text` and replace every ASCII punctuation mark by a space.

    The characters replaced are exactly those in `string.punctuation`,
    backslash included. Each one becomes a single space, so token boundaries
    are preserved; whitespace is not collapsed here.

    Args:
    - text (str): The input text.

    Returns:
    - str: The lowercased text with punctuation turned into spaces.
    """
    import string

    # A translation table avoids building a regex character class from
    # unescaped punctuation: in '[' + string.punctuation + ']' the sequence
    # '\]' is read as an escaped ']', so the backslash was never removed.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    return text.lower().translate(punctuation_to_space)

# Two passes over the cleaned documents: first lowercase and blank out the
# punctuation, then collapse the whitespace this leaves behind.
lowered_texts = map(lowercase_and_remove_punctuation, cleaned_text_list)
cleaned_text_list_to_save = list(map(normalize_whitespace, lowered_texts))

In [None]:
import string

# Show string.punctuation wrapped in square brackets
print(f'[{string.punctuation}]')


# Save the pre-processed dataset on a csv file

In [None]:
# Build the 'file' / 'text' table in a single step, pairing each file name
# with its pre-processed text. Adding rows one at a time with df.loc[i]
# re-allocates the frame on every insertion.
df = pd.DataFrame(
    list(zip(file_list, cleaned_text_list_to_save)),
    columns=['file', 'text'],
)

# Save the DataFrame to a CSV file (a header row is written, the index is not)
df.to_csv('./Files/cleaned_dataset.csv', index=False)

# Output the DataFrame to verify
print(df)

In [None]:
filename_all_data_dict = "./Files/cleaned_dataset.csv"

# The file is read with explicit column names and no header handling, so its
# first row (the header line written when the CSV was saved) arrives as data
# at index 0 and is dropped right away.
cleaned_data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=['file', 'text'],
).drop(index=0)
cleaned_data_df

In [None]:
# Extract the 'text' column of the reloaded dataset as a plain Python list
text_list_preprocessed = cleaned_data_df['text'].tolist()

In [None]:
# Display the first reloaded document
text_list_preprocessed[0]

# Import the new cleaned dataset and process it once more

In [None]:
def remove_use_case(text):
    """Cut everything from "use case seu" to the end of that line.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text without the matched tails.
    """
    use_case_tail = re.compile(r'(use case seu).*')
    return use_case_tail.sub('', text)

# Rewrite the list in place with the "use case seu" tails removed
text_list_preprocessed[:] = [
    remove_use_case(document) for document in text_list_preprocessed
]


In [None]:
# Contractions are resolved before any special character is stripped, so
# the first step is to list every word that carries an apostrophe.
def find_words_with_apostrophe(text):
    """
    Collect every word in `text` that contains an apostrophe, either the
    typographic one (’) or the plain ASCII one (').

    Parameters:
    text (str): The input text from which to extract words.

    Returns:
    list: The matching words, in order of appearance (duplicates included).
    """
    # First alternative: typographic apostrophe; second: ASCII apostrophe
    apostrophe_word = re.compile(r"\b\w*’\w*\b|\b\w*'\w*\b")
    return apostrophe_word.findall(text)

# Gather the apostrophe words of all documents into one flat list
words_with_apostr = [
    word
    for document in text_list_preprocessed
    for word in find_words_with_apostrophe(document)
]

# Keep each word only once
unique_words_with_apostr = set(words_with_apostr)

print(unique_words_with_apostr)


In [None]:
# Initial imports
import openai
import os

# Load settings (such as OPENAI_API_KEY) from a .env file, if one is found
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

# Fail early with a clear message when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` changed nothing when the key was set and
# raised an opaque TypeError (None is not a str) when it was not.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set: define it in the environment or in a .env file"
    )

client = openai.OpenAI()

def get_completion(prompt, model="gpt-4o-mini"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
    - prompt (str): The text of the user message.
    - model (str): Name of the chat model to call.

    Returns:
    - The content of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Render the set of unique apostrophe words as text for use in the prompt
text = f"""{set(words_with_apostr)}"""

prompt = f"""
For each word given as input 
```{text}``` return the contracted word and the respective extension in italian.
"""
response = get_completion(prompt)

response_text = f'''{response}'''

prompt_rewrite = f"""
Given the '''{response_text}''' in the form **contraction** -- extended contraction  
return it as "contraction" : "extended contraction" 
"""

rewrite = get_completion(prompt_rewrite)

# Matches numbered lines of the form  1. **key** : **value**
# and rewrites them as  "key": "value",
# NOTE(review): this assumes the model answers in exactly that layout;
# lines in any other shape are left unchanged — verify the output.
pattern = r"^\d+\.\s*\*\*(.*?)\*\*\s*:\s*\*\*(.*?)\*\*"
replacement = r'"\1": "\2",'

# MULTILINE makes ^ match at the start of every line of the reply
modified_text = re.sub(pattern, replacement, rewrite, flags=re.MULTILINE)

# Output the modified text - which we checked and saved on a txt file in the format 'key':'value', for 'contraction':'extended contraction'
print(modified_text)


In [None]:
# Maps each contraction to its expanded form
contraction_dictionary = {}

# Each useful line of the file looks like  "contraction": "expansion",
with open('italian_contractions.txt', 'r', encoding='utf-8') as contractions_file:
    for line in contractions_file:
        # Drop surrounding whitespace and the trailing comma, if any
        entry = line.strip().rstrip(',')

        # Split at the first colon only; lines without a colon are ignored
        key, separator, value = entry.partition(':')
        if not separator:
            continue

        # Remove the surrounding whitespace and double quotes
        key = key.strip().strip('"')
        value = value.strip().strip('"')
        contraction_dictionary[key] = value

# Print the resulting dictionary
print(len(contraction_dictionary),contraction_dictionary)


In [None]:
def expand_contractions(text, contraction_dict):
    """Replace every contraction found in `text` by its expansion.

    Replacements are plain substring substitutions, applied one dictionary
    entry after the other in the dictionary's iteration order.

    Args:
    - text (str): The input text.
    - contraction_dict (dict): Maps each contraction to its expanded form.

    Returns:
    - str: The text with the contractions expanded.
    """
    for contraction, expansion in contraction_dict.items():
        if not contraction:
            # str.replace("", x) would insert x between every character;
            # an empty key can come from a malformed dictionary line.
            continue
        text = text.replace(contraction, expansion)
    return text

# Expand the contractions of every reloaded document; the result is a new
# list, text_list_preprocessed itself is left unchanged
expanded_texts = [expand_contractions(text, contraction_dictionary) for text in text_list_preprocessed]

In [None]:
# Apostrophe words that are still present after the expansion step
remaining_words_with_apostr = [
    word
    for document in expanded_texts
    for word in find_words_with_apostrophe(document)
]

# Keep each word only once
remaining_unique_words_with_apostr = set(remaining_words_with_apostr)

print(remaining_unique_words_with_apostr)

In [None]:
def remove_any_special_characters(text):
    """Delete every character that is not a letter, a digit or whitespace.

    Letters are matched with the Unicode-aware `\\w`, so accented Italian
    letters (à, è, é, ì, ò, ù) are kept. The previous ASCII-only class
    `[^a-zA-Z0-9\\s]` deleted them, turning e.g. "perché" into "perch".
    The underscore, which `\\w` also covers, is still removed.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text without special characters.
    """
    special_character = re.compile(r"[^\w\s]|_")
    return special_character.sub('', text)

# Strip the special characters of every expanded document, then collapse
# the whitespace; the list is updated in place.
for position, document in enumerate(expanded_texts):
    without_specials = remove_any_special_characters(document)
    expanded_texts[position] = normalize_whitespace(without_specials)

# NOTE(review): this displays the first document of text_list_preprocessed,
# i.e. the version from before the steps above — confirm whether
# expanded_texts[0] was meant.
text_list_preprocessed[0]


In [None]:
# Pick a random position in the list and display that document
num = random.randrange(len(text_list_preprocessed))
text_list_preprocessed[num]

In [None]:
import spacy
from tqdm import tqdm
from spellchecker import SpellChecker

# spaCy pipeline for Italian and pyspellchecker dictionary for Italian
nlp = spacy.load('it_core_news_sm')
spell = SpellChecker(language='it')

# Report, document by document, the words unknown to the spell checker
for i, text in enumerate(tqdm(text_list_preprocessed, desc = "Checking text: ")):
    print(f"Document {i+1}:")

    # Token texts as produced by the spaCy pipeline
    tokens = [token.text for token in nlp(text)]

    # For each unknown word, print the preferred correction and all candidates
    for word in spell.unknown(tokens):
        correction = spell.correction(word)
        suggestions = spell.candidates(word)
        print(f"  Misspelled: {word}, Correction: {correction}, Suggestions: {suggestions}")

    print()  # Blank line between documents

In [None]:
# Remove other special characters
# re.sub('[^a-zA-Z0-9\\s]', '', text)

# Analyze word frequency and elements present in the resulting text

In [None]:
# Inspect one document, taken from the hard-coded position 252 of the saved list
result = cleaned_text_list_to_save[252]
# Whitespace-separated tokens (repetitions included) ...
splitted_text = result.lower().split()
# ... and the distinct tokens
set_text = set(splitted_text)
# Cell output: token count, distinct-token count, then both collections
len(splitted_text), len(set_text), splitted_text, set_text

In [None]:
# Analyze vocabulary: print the distinct tokens of the document in sorted order
sorted_text = sorted(set_text)
print(sorted_text)

In [None]:
# Remove punctuation before splitting the text
import string
# Character class matching any single ASCII punctuation mark.
# re.escape is required: without it the sequence '\]' inside the class is
# read as an escaped ']', so the backslash itself is never matched.
regex = '[' + re.escape(string.punctuation) + ']'
print(regex)

In [None]:
# Replace every character matched by `regex` in the inspected document with a space
no_punctuation_result = re.sub(regex,' ',result)
no_punctuation_result

In [None]:
# Tokens of the punctuation-free document (lowercased, split on whitespace)
no_p_splitted_text = no_punctuation_result.lower().split()
# Distinct tokens, then the same in sorted order
set_text_no_p = set(no_p_splitted_text)
sort_no_p = sorted(set_text_no_p)
# Vocabulary size followed by the vocabulary itself
print(len(sort_no_p)) 
print(sort_no_p)

In [None]:
import nltk
# Frequency distribution of the tokens of the punctuation-free document
counts = nltk.FreqDist(no_p_splitted_text)
print(counts)

In [None]:
# The ten most frequent tokens with their counts
counts.most_common(10)

In [None]:
# Extract the least common words: the last 20 entries of the full
# most-common-first ranking
least_common = counts.most_common()[-20:]

# Display the least common words
least_common

In [None]:
# Make sure the NLTK stop-word corpus is available, then list the Italian stop words
nltk.download('stopwords')
print('Italian stopwords:')
print(stopwords.words('italian'))

# Data preprocessing final operations

### Revise and decide what operations to maintain and in which order

In [None]:
# Function to preprocess Italian text
def preprocess_text(text):
    """Lowercase the text, expand contractions and normalise hyphens.

    Steps, in order:
    1. Lowercase.
    2. Expand contractions with `expand_contractions` and the
       `contraction_dictionary` built earlier in this file.
    3. Re-join words that were hyphenated across a line break.
    4. Replace the remaining hyphens with spaces.

    Args:
    - text (str): The input text.

    Returns:
    - str: The pre-processed text.
    """
    # 1. Lowercasing
    text = text.lower()

    # 2. Expand contractions with the custom dictionary.
    # The previous call, `contractions.fix(text, lang='it')`, could not work:
    # the `contractions` name imported at the top of this file comes from
    # spaCy's English stop-word module and is a list with no `fix` method.
    text = expand_contractions(text, contraction_dictionary)

    # 3. Re-join words split across lines ("prepro-\ncessing"). This must run
    # before step 4, which would otherwise remove the hyphen it looks for.
    text = re.sub(r'\b(\w+)-\n(\w+)\b', r'\1\2', text)

    # 4. Split the remaining hyphenated words at the hyphen
    text = text.replace('-', ' ')

    # NOTE(review): the former spelling-correction step used TextBlob, whose
    # import is commented out in this file (the call raised NameError) and
    # which the original comment already flagged as weak for Italian. It has
    # been left out; spelling is handled by the LanguageTool step earlier in
    # this file — confirm that this is sufficient.
    return text

In [None]:
# Function to normalize Italian text (abbreviations, acronyms, dates)
def normalize_text(text):
    """Expand known abbreviations and split numeric dates.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with the abbreviations below replaced by their full
      form and dates such as 10/05/2022 rewritten as "10 05 2022".
    """
    # Example dictionary for Italian abbreviations and acronyms
    abbrev_dict = {
        "N.B.": "Nota bene",
        "A.D.E": "Agenzia Delle Entrate"
    }

    # Replace abbreviations with full forms.
    # - re.escape: the dots must match literal dots, not any character.
    # - Look-arounds instead of \b: an abbreviation that ends with "." is
    #   normally followed by a space, where \b can never match.
    for abbrev, full_form in abbrev_dict.items():
        abbrev_pattern = r'(?<!\w)' + re.escape(abbrev) + r'(?!\w)'
        text = re.sub(abbrev_pattern, full_form, text)

    # Normalize dates by replacing the slashes with spaces
    # (e.g., 10/05/2022 --> "10 05 2022"); the month stays numeric
    text = re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2,4})', r'\1 \2 \3', text)

    # Normalize numbers (optional, depending on your need)
    # text = re.sub(r'\d+', '<NUMERO>', text)

    return text

In [None]:
# Tokenization and Post-processing for Italian text
def postprocess_text(text):
    """Lemmatise the text, drop stop words and drop words seen only once.

    Args:
    - text (str): The input text.

    Returns:
    - str: The remaining lemmas, in their original order, separated by
      single spaces.
    """
    # Tokenize and lemmatise with the spaCy pipeline
    doc = nlp(text)
    lem_text = ' '.join(token.lemma_ for token in doc)

    # Remove stop words (lemmas are re-split on whitespace first)
    tokens = [token for token in lem_text.split() if token not in stop_words]

    # Occurrences of every remaining token within this text
    word_freq = Counter(tokens)

    # Keep only the tokens that occur at least twice. Looking the count up
    # directly replaces the former membership test against a list of rare
    # words, which re-scanned that list for every token.
    final_tokens = [word for word in tokens if word_freq[word] >= 2]

    return ' '.join(final_tokens)