In [1]:
import os
import re
import csv
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus (it only needs to be downloaded once)
nltk.download('stopwords')
# Kept as a set so the per-token membership test in clean_description is fast
STOPWORDS = set(stopwords.words('english'))

# Input/output files
input_csv = './data/used_cars_data.csv'
output_csv = './data/dataset_full_cleaned.csv'
# Make sure the data directory exists before the output file is opened
os.makedirs('./data', exist_ok=True)

# Columns to keep (final order)
final_columns = [
    'make_name', 'model_name', 'price', 'year', 'city',
    'daysonmarket', 'horsepower', 'engine_displacement', 'description'
]

def clean_description(text):
    """Strip punctuation and stopwords from a free-text description.

    Every character that is neither a word character nor whitespace is
    deleted, the remainder is lower-cased and split on whitespace, and any
    token present in STOPWORDS is discarded. The surviving tokens are
    returned joined by single spaces.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    tokens = without_punctuation.lower().split()
    return " ".join(token for token in tokens if token not in STOPWORDS)

def is_valid_number(value):
    """Return True when *value* parses as a finite number greater than zero.

    Args:
        value: Anything accepted by ``float()``, typically a raw CSV cell.

    Returns:
        bool: True for finite values strictly greater than 0. False for
        zero, negatives, NaN, infinities, empty strings, None and anything
        else ``float()`` cannot convert.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or other non-numeric objects.
        # ValueError: text that is not a number (including '').
        # OverflowError: integers too large to be represented as a float.
        return False
    # The upper bound rejects "inf"/"Infinity"; NaN fails every comparison.
    return 0 < number < float('inf')

def clean_row(row):
    """Validate one CSV record and project it onto the output columns.

    A row is rejected when any numeric field fails is_valid_number or when
    any of the make/model/city fields is missing or blank. For accepted
    rows, commas and newlines in those text fields are replaced by spaces
    and the description is passed through clean_description.

    Args:
        row: Mapping of column name to raw cell value, as produced by
            csv.DictReader. It is not modified.

    Returns:
        list | None: The values for ``final_columns`` in that order, or
        None when the row is invalid or malformed.
    """
    numeric_fields = ('price', 'year', 'daysonmarket', 'horsepower', 'engine_displacement')
    text_fields = ('make_name', 'model_name', 'city')

    try:
        # Checks on the numeric fields
        if not all(is_valid_number(row.get(field, '')) for field in numeric_fields):
            return None

        # Cleaned values are collected separately so the caller's row is
        # left untouched.
        overrides = {}

        # Clean the text fields
        for field in text_fields:
            value = row.get(field)
            if value is None or value.strip() == '':
                return None
            overrides[field] = value.replace(',', ' ').replace('\n', ' ').strip()

        # Clean the description
        desc = row.get('description', '')
        overrides['description'] = clean_description(desc) if desc else ''

        # Return only the final columns, in order
        return [overrides.get(col, row.get(col, '')) for col in final_columns]

    except (AttributeError, TypeError):
        # Malformed input (row is not a mapping, or a cell is not a string)
        # is treated as an invalid row. Any other exception indicates a
        # programming error and is allowed to propagate instead of silently
        # discarding data.
        return None

def main():
    """Stream the input CSV through clean_row and write the surviving rows.

    The output file starts with a header made of final_columns. Once both
    files are closed, a summary with the kept/total row counts is printed.
    """
    total_rows = 0
    cleaned_rows = 0

    with open(input_csv, 'r', encoding='utf-8') as infile:
        with open(output_csv, 'w', encoding='utf-8', newline='') as outfile:
            reader = csv.DictReader(infile)
            writer = csv.writer(outfile)
            writer.writerow(final_columns)  # header

            for total_rows, row in enumerate(reader, start=1):
                cleaned = clean_row(row)
                if not cleaned:
                    # Invalid row: counted in the total but not written.
                    continue
                writer.writerow(cleaned)
                cleaned_rows += 1

    print(f"[✓] Pulizia completata: {cleaned_rows}/{total_rows} righe valide salvate in '{output_csv}'.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[✓] Pulizia completata: 2780756/3000040 righe valide salvate in './data/dataset_full_cleaned.csv'.
