# Paris data preprocessing

In [1]:
import os
import pandas as pd
from pathlib import Path

## Data overview

In [2]:

paris_path = Path("../data/raw/paris")


def find_csv_files(base_dir):
    """Return every CSV file below *base_dir*, sorted, each listed once.

    The extension is compared case-insensitively, so ``.csv``, ``.CSV`` and
    mixed-case spellings are all found. Globbing for ``*.csv`` and ``*.CSV``
    separately would report each file twice wherever glob matching is
    case-insensitive, and would miss spellings such as ``.Csv``.
    Directories whose name happens to end in ``.csv`` are skipped.
    """
    return sorted(
        candidate
        for candidate in Path(base_dir).rglob("*")
        if candidate.suffix.lower() == ".csv" and candidate.is_file()
    )


print("=== COMPREHENSIVE PARIS DATA ANALYSIS ===")
print("=== DIRECTORY STRUCTURE ===")
for root, dirs, files in os.walk(paris_path):
    # Nesting depth relative to the data root drives the indentation.
    level = len(Path(root).relative_to(paris_path).parts)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        file_size = os.path.getsize(os.path.join(root, file)) / 1024  # Size in KB
        print(f"{subindent}{file} ({file_size:.1f} KB)")

print("\n" + "="*80)
print("=== DATA FILE ANALYSIS ===")

# Analyze all CSV files, whatever the case of their extension
for csv_file in find_csv_files(paris_path):
    print(f"\n--- {csv_file} ---")
    try:
        # First, detect the file structure from the first few lines.
        # Undecodable bytes are dropped here: only the delimiter matters.
        with open(csv_file, 'r', encoding='utf-8', errors='ignore') as f:
            first_lines = [f.readline().strip() for _ in range(5)]

        # Any semicolon in the sample means the file is semicolon-delimited
        is_semicolon = any(';' in line for line in first_lines)
        delimiter = ';' if is_semicolon else ','

        print(f"Delimiter: {'semicolon' if is_semicolon else 'comma'}")

        # Read only a small preview with the detected delimiter
        df = pd.read_csv(csv_file, sep=delimiter, nrows=5, encoding='utf-8')

        print(f"Shape: {df.shape}")
        print(f"Columns ({len(df.columns)}): {df.columns.tolist()}")

        # Show sample data
        if len(df.columns) > 1:  # If properly structured data
            print("First 3 rows:")
            print(df.head(3).to_string())
        else:
            # If single column, show the content structure
            col_name = df.columns[0]
            print("Single column content sample:")
            for i, val in enumerate(df[col_name].head(3)):
                print(f"  Row {i}: {str(val)[:100]}...")

    except Exception as e:
        # Best-effort exploration: report the problem and move on to the
        # next file instead of aborting the whole overview.
        print(f"Error reading {csv_file}: {e}")
        # Try to at least show file info
        try:
            file_size = os.path.getsize(csv_file) / 1024 / 1024  # MB
            print(f"File size: {file_size:.2f} MB")
            with open(csv_file, 'r', encoding='utf-8', errors='ignore') as f:
                first_line = f.readline().strip()
                print(f"First line sample: {first_line[:200]}...")
        except OSError:
            # Only filesystem errors are expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            print("Could not read basic file info")

print("\n" + "="*80)



=== COMPREHENSIVE PARIS DATA ANALYSIS ===
=== DIRECTORY STRUCTURE ===
paris/
  .DS_Store (10.0 KB)
  .gitkeep (0.0 KB)
  quartier_paris.geojson (346.1 KB)
  household prices/
    .DS_Store (6.0 KB)
    valeursfoncieres-2021.txt.zip (84768.1 KB)
    ValeursFoncieres-2021.txt (612877.7 KB)
  POPULATION/
    .DS_Store (10.0 KB)
    base-ic-evol-struct-pop-2019_csv/
      meta_base-ic-evol-struct-pop-2019.CSV (13857.3 KB)
      base-ic-evol-struct-pop-2019.CSV (47971.2 KB)
    base-ic-evol-struct-pop-2022_csv/
      meta_base-ic-evol-struct-pop-2022.CSV (13851.3 KB)
      base-ic-evol-struct-pop-2022.CSV (50067.9 KB)
    base-ic-evol-struct-pop-2021_csv/
      .DS_Store (6.0 KB)
      base-ic-evol-struct-pop-2021.CSV (54021.5 KB)
      meta_base-ic-evol-struct-pop-2021.CSV (13853.4 KB)
    serie_001760155_24112025/
      characteristics.csv (0.5 KB)
      annual_values.csv (0.4 KB)
    base-ic-evol-struct-pop-2015_csv/
      base-ic-evol-struct-pop-2015.csv (16057.5 KB)
      .DS_Store (6.