# In [None]:
import pandas as pd
import string
import re

# Legal/contract references that must survive punctuation stripping,
# e.g. "art. 15" or "d.lgs 81/08". The single capturing group makes
# re.split() return the matched references at the odd indexes.
_LEGAL_REF_RE = re.compile(r'\b(art\.? \d+|d\.lgs \d+/\d+)\b')

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """
    Normalizes a piece of text while keeping legal references intact.

    The text is lowercased, surrounding whitespace is trimmed and runs of
    whitespace are collapsed to a single space. ASCII punctuation is removed
    everywhere except inside legal references such as "art. 15" or
    "d.lgs 81/08", which are kept with their punctuation and uppercased
    ("ART. 15", "D.LGS 81/08").

    Parameters:
    text: The value to clean. Non-string values are returned unchanged.

    Returns:
    The cleaned string, or the original value if it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Collapse whitespace first so the single-space reference pattern matches
    # even when the input contained tabs, newlines or repeated spaces.
    normalized = re.sub(r'\s+', ' ', text.lower().strip())

    # Split into alternating [plain, reference, plain, ...] segments; the
    # references are detected BEFORE punctuation is stripped, otherwise
    # "d.lgs 81/08" would already have been mangled into "dlgs 8108".
    segments = _LEGAL_REF_RE.split(normalized)
    cleaned = [
        segment.upper() if index % 2 else segment.translate(_PUNCT_TABLE)
        for index, segment in enumerate(segments)
    ]

    # Removing punctuation can leave doubled or dangling spaces
    # (e.g. "a - b" -> "a  b"), so collapse and trim once more.
    return re.sub(r'\s+', ' ', ''.join(cleaned)).strip()

def process_dataframe(df, col1='Titolo', col2='descrizione', new_col='Cleaned_Column'):
    """
    Combines two specified columns, cleans the text, and stores it in a new column.

    The two columns are converted to text and joined with a single space, then
    each combined value is passed through clean_text. Missing values (NaN/None)
    contribute an empty string rather than the literal text "nan".

    Note: the new column is added to the dataframe that was passed in (the
    input is modified in place) and that same dataframe is returned.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    col1 (str): Name of the first column to combine.
    col2 (str): Name of the second column to combine.
    new_col (str): Name of the new column to store the cleaned text.

    Returns:
    pd.DataFrame: Dataframe with the new cleaned column.
    """
    # Convert each column to text, blanking out missing cells so that
    # astype(str) does not leak "nan" (or propagate NaN) into the result.
    first, second = (
        df[name].astype(str).where(df[name].notna(), '')
        for name in (col1, col2)
    )
    df[new_col] = (first + ' ' + second).apply(clean_text)  # Combine, then clean
    return df

def load_and_process_data(file_path, output_path="output.csv"):
    """
    Reads a CSV file, builds the cleaned text column from its 'Titolo' and
    'descrizione' columns, writes the result to a CSV file and returns it.

    Parameters:
    file_path (str): Path to the input CSV file.
    output_path (str): Where the processed CSV is written (default: 'output.csv').

    Returns:
    pd.DataFrame: The processed dataframe, including the 'Cleaned_Column' column.
    """
    processed = process_dataframe(
        pd.read_csv(file_path),
        col1="Titolo",
        col2="descrizione",
        new_col="Cleaned_Column",
    )
    # The row index is not written to the output file.
    processed.to_csv(output_path, index=False)
    return processed

# Example usage:
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\F" and "\P".
df = load_and_process_data(r"D:\FinetuningBERT-HFT\Partially sampled 2023.csv")
print(df.head())
