In [1]:
import json
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


Libraries imported successfully!




In [2]:
# Load all datasets
print("Loading datasets...")

# Load real estate listings
with open('data.json', 'r', encoding='utf-8') as f:
    listings_data = json.load(f)
listings_df = pd.DataFrame(listings_data)

# Load cadastral data
with open('catastro_parcels.json', 'r', encoding='utf-8') as f:
    parcels_data = json.load(f)
parcels_df = pd.DataFrame(parcels_data)

with open('catastro_buildings.json', 'r', encoding='utf-8') as f:
    buildings_data = json.load(f)
buildings_df = pd.DataFrame(buildings_data)

with open('catastro_units.json', 'r', encoding='utf-8') as f:
    units_data = json.load(f)
units_df = pd.DataFrame(units_data)

print(f"✓ Listings: {len(listings_df)} records")
print(f"✓ Parcels: {len(parcels_df)} records")
print(f"✓ Buildings: {len(buildings_df)} records")
print(f"✓ Units: {len(units_df)} records")


Loading datasets...
✓ Listings: 73783 records
✓ Parcels: 10110 records
✓ Buildings: 6495 records
✓ Units: 7924 records


In [3]:
# Explore the data structure
print("=== LISTINGS DATA SAMPLE ===")
print(f"Columns: {list(listings_df.columns)}")
print(f"\nSample record:")
sample_listing = listings_df.iloc[0]
for key in ['listing_id', 'City', 'PropertyType', 'LivingAreaInternal', 'Bedrooms', 'target_amount']:
    if key in sample_listing:
        print(f"  {key}: {sample_listing[key]}")

print(f"\n=== PARCELS DATA SAMPLE ===")
print(f"Columns: {list(parcels_df.columns)}")
sample_parcel = parcels_df.iloc[0]
for key in ['id', 'referencia_catastral', 'municipio', 'superficie_parcela']:
    if key in sample_parcel:
        print(f"  {key}: {sample_parcel[key]}")

print(f"\n=== BUILDINGS DATA SAMPLE ===")
print(f"Columns: {list(buildings_df.columns)}")
sample_building = buildings_df.iloc[0]
for key in ['id', 'parcel_ref', 'building_type', 'built_area', 'municipality']:
    if key in sample_building:
        print(f"  {key}: {sample_building[key]}")

print(f"\n=== UNITS DATA SAMPLE ===")
print(f"Columns: {list(units_df.columns)}")
sample_unit = units_df.iloc[0]
for key in ['id', 'parcel_ref', 'use_type', 'floor_area', 'year_built']:
    if key in sample_unit:
        print(f"  {key}: {sample_unit[key]}")


=== LISTINGS DATA SAMPLE ===
Columns: ['listing_id', 'company_id', 'target_amount', 'listing_created_at', 'days_on_market', 'Country', 'State', 'City', 'Neighbourhood', 'PropertyType', 'BuildingStyle', 'Bedrooms', 'Bathrooms', 'FloorsCount', 'FloorNumber', 'TotalFloors', 'has_elevator', 'has_kitchen', 'KitchenType', 'LivingAreaInternal', 'LivingAreaExternal', 'PlotSize', 'price_per_sqm', 'furnished', 'needs_renovation', 'under_construction', 'ConstructionYear', 'property_age', 'HeatingType', 'has_ac', 'has_solar', 'energy_cert_available', 'has_garden', 'garden_size', 'has_pool', 'pool_size', 'parking_count', 'min_parking_cost', 'terrace_count', 'terrace_total_size', 'amenities_csv', 'amenities_count']

Sample record:
  listing_id: 691
  City: Calvià
  PropertyType: Apartment
  LivingAreaInternal: 77.0
  Bedrooms: 2.0
  target_amount: 895000.0

=== PARCELS DATA SAMPLE ===
Columns: ['id', 'referencia_catastral', 'municipio', 'codigo_municipio', 'provincia', 'codigo_provincia', 'superfici

In [4]:
# Analyze key matching fields
print("=== KEY MATCHING FIELDS ANALYSIS ===")

print(f"\n🏙️ Cities in listings ({listings_df['City'].nunique()} unique):")
city_counts = listings_df['City'].value_counts().head(10)
for city, count in city_counts.items():
    print(f"  {city}: {count} listings")

print(f"\n🏛️ Municipalities in cadastral data ({parcels_df['municipio'].nunique()} unique):")
municipio_counts = parcels_df['municipio'].value_counts().head(10)
for municipio, count in municipio_counts.items():
    print(f"  {municipio}: {count} parcels")

print(f"\n🏠 Property types in listings:")
prop_type_counts = listings_df['PropertyType'].value_counts()
for prop_type, count in prop_type_counts.items():
    print(f"  {prop_type}: {count} listings")

print(f"\n🏡 Use types in cadastral units:")
use_type_counts = units_df['use_type'].value_counts()
for use_type, count in use_type_counts.items():
    print(f"  {use_type}: {count} units")

print(f"\n📐 Living area statistics:")
print(f"  Listings LivingAreaInternal: {listings_df['LivingAreaInternal'].describe()}")
print(f"  Units floor_area: {units_df['floor_area'].describe()}")
print(f"  Buildings built_area: {buildings_df['built_area'].describe()}")


=== KEY MATCHING FIELDS ANALYSIS ===

🏙️ Cities in listings (74 unique):
  Palma de Mallorca: 16232 listings
  Calvià: 11192 listings
  Eivissa: 3620 listings
  Andratx: 3467 listings
  Llucmajor: 3182 listings
  Manacor: 2577 listings
  Santanyí: 2226 listings
  Pollença: 1929 listings
  Ciutadella de Menorca: 1515 listings
  Maó / Mahon: 1510 listings

🏛️ Municipalities in cadastral data (2 unique):
  SELVA: 5131 parcels
  SANTA MARIA DEL CAMI: 4979 parcels

🏠 Property types in listings:
  Villa: 16598 listings
  Apartment: 15804 listings
  Country house: 8132 listings
  Detached house: 5684 listings
  Land: 5232 listings
  Townhouse: 4222 listings
  Penthouse: 4070 listings
  Commercial Space: 2728 listings
  Plot: 2649 listings
  Flat: 2332 listings
  Semi-detached house: 1580 listings
  Terraced house: 1557 listings
  Parking: 471 listings
  Attic Apartment: 405 listings
  Hotel: 358 listings
  Office: 239 listings
  Duplex: 231 listings
  Investment: 230 listings
  Gastronomy: 19

In [5]:
def clean_city_name(city_name):
    """Clean city names for better matching"""
    if pd.isna(city_name):
        return ""
    # Handle encoding issues and normalize
    cleaned = str(city_name).strip()
    # Fix common encoding issues
    cleaned = cleaned.replace('Ã ', 'à').replace('Ã¡', 'á').replace('Ã­', 'í')
    cleaned = cleaned.replace('Ã³', 'ó').replace('Ãº', 'ú').replace('Ã±', 'ñ')
    return cleaned.upper()

def normalize_property_type(prop_type):
    """Normalize property types for matching"""
    if pd.isna(prop_type):
        return ""
    prop_type = str(prop_type).upper()
    
    # Create mapping for common property types
    type_mapping = {
        'APARTMENT': 'RESIDENTIAL',
        'VILLA': 'RESIDENTIAL', 
        'HOUSE': 'RESIDENTIAL',
        'TOWNHOUSE': 'RESIDENTIAL',
        'RESIDENTIAL': 'RESIDENTIAL'
    }
    
    return type_mapping.get(prop_type, prop_type)

def area_similarity_score(area1, area2, tolerance_pct=0.2):
    """Calculate area similarity score with tolerance"""
    if pd.isna(area1) or pd.isna(area2) or area1 <= 0 or area2 <= 0:
        return 0
    
    # Calculate percentage difference
    diff_pct = abs(area1 - area2) / max(area1, area2)
    
    if diff_pct <= tolerance_pct:
        return 1 - (diff_pct / tolerance_pct)  # Score from 1 to 0 within tolerance
    else:
        return 0  # No score if outside tolerance

def geographic_similarity_score(city, municipio):
    """Calculate geographic similarity using fuzzy matching"""
    if pd.isna(city) or pd.isna(municipio):
        return 0
    
    city_clean = clean_city_name(city)
    municipio_clean = clean_city_name(municipio)
    
    # Exact match gets highest score
    if city_clean == municipio_clean:
        return 1.0
    
    # Fuzzy match for partial similarities
    fuzzy_score = fuzz.ratio(city_clean, municipio_clean) / 100.0
    
    # Only consider reasonable matches (>70% similarity)
    return fuzzy_score if fuzzy_score > 0.7 else 0

print("✓ Matching functions defined successfully!")


✓ Matching functions defined successfully!


In [6]:
def create_cadastral_aggregated_data():
    """Create aggregated cadastral data by combining parcels, buildings, and units"""
    
    # Group buildings by parcel
    buildings_agg = buildings_df.groupby('parcel_ref').agg({
        'built_area': 'sum',
        'municipality': 'first',
        'building_type': lambda x: ', '.join(x.unique())
    }).reset_index()
    
    # Group units by parcel
    units_agg = units_df.groupby('parcel_ref').agg({
        'floor_area': 'sum',
        'use_type': lambda x: ', '.join(x.unique()),
        'year_built': 'mean'
    }).reset_index()
    
    # Start with parcels and join with aggregated buildings and units
    cadastral_combined = parcels_df.copy()
    cadastral_combined = cadastral_combined.merge(
        buildings_agg, 
        left_on='referencia_catastral', 
        right_on='parcel_ref', 
        how='left'
    )
    cadastral_combined = cadastral_combined.merge(
        units_agg, 
        left_on='referencia_catastral', 
        right_on='parcel_ref', 
        how='left'
    )
    
    # Create total living area from units and buildings
    cadastral_combined['total_living_area'] = np.where(
        cadastral_combined['floor_area'].notna(),
        cadastral_combined['floor_area'],
        cadastral_combined['built_area']
    )
    
    return cadastral_combined

def match_listing_to_cadastral(listing, cadastral_data, weights=None):
    """Match a single listing to cadastral data with scoring"""
    
    if weights is None:
        weights = {
            'geographic': 0.5,  # City matching is most important
            'area': 0.3,        # Area matching is second most important  
            'property_type': 0.2 # Property type is least important
        }
    
    matches = []
    
    for idx, cadastral_row in cadastral_data.iterrows():
        score_components = {}
        
        # Geographic score
        geo_score = geographic_similarity_score(
            listing['City'], 
            cadastral_row['municipio']
        )
        score_components['geographic'] = geo_score
        
        # Area score
        area_score = area_similarity_score(
            listing['LivingAreaInternal'],
            cadastral_row['total_living_area']
        )
        score_components['area'] = area_score
        
        # Property type score (simplified)
        listing_type_norm = normalize_property_type(listing['PropertyType'])
        cadastral_type_norm = normalize_property_type(cadastral_row.get('use_type', ''))
        type_score = 1.0 if listing_type_norm == cadastral_type_norm else 0.5
        score_components['property_type'] = type_score
        
        # Calculate weighted total score
        total_score = sum(weights[key] * score_components[key] for key in weights.keys())
        
        # Only include matches with reasonable scores
        if total_score > 0.3:  # Minimum threshold
            matches.append({
                'cadastral_id': cadastral_row['id'],
                'referencia_catastral': cadastral_row['referencia_catastral'],
                'municipio': cadastral_row['municipio'],
                'total_living_area': cadastral_row['total_living_area'],
                'total_score': total_score,
                **score_components
            })
    
    # Sort by score and return top matches
    matches.sort(key=lambda x: x['total_score'], reverse=True)
    return matches[:5]  # Return top 5 matches

print("✓ Matching algorithm defined successfully!")


✓ Matching algorithm defined successfully!


In [7]:
# Create aggregated cadastral data
print("Creating aggregated cadastral data...")
cadastral_combined = create_cadastral_aggregated_data()
print(f"✓ Created {len(cadastral_combined)} combined cadastral records")

# Preview the combined data
print(f"\nSample combined cadastral record:")
sample_cadastral = cadastral_combined.iloc[0]
for key in ['id', 'referencia_catastral', 'municipio', 'total_living_area', 'use_type']:
    if key in sample_cadastral:
        print(f"  {key}: {sample_cadastral[key]}")

# Test matching on a few sample listings
print(f"\n=== TESTING MATCHING ON SAMPLE LISTINGS ===")
sample_listings = listings_df.head(3)  # Test on first 3 listings

for idx, listing in sample_listings.iterrows():
    print(f"\n🏠 LISTING {listing['listing_id']}:")
    print(f"  City: {listing['City']}")
    print(f"  Type: {listing['PropertyType']}")
    print(f"  Area: {listing['LivingAreaInternal']} m²")
    print(f"  Price: €{listing['target_amount']:,}")
    
    # Find matches
    matches = match_listing_to_cadastral(listing, cadastral_combined)
    
    if matches:
        print(f"  🎯 Found {len(matches)} potential matches:")
        for i, match in enumerate(matches):
            print(f"    {i+1}. Score: {match['total_score']:.3f} | "
                  f"Municipality: {match['municipio']} | "
                  f"Area: {match['total_living_area']} m² | "
                  f"Ref: {match['referencia_catastral']}")
    else:
        print(f"  ❌ No suitable matches found")


Creating aggregated cadastral data...
✓ Created 10110 combined cadastral records

Sample combined cadastral record:
  id: 2872
  referencia_catastral: 000100100DD99E
  municipio: SELVA
  total_living_area: 179.0
  use_type: Residencial

=== TESTING MATCHING ON SAMPLE LISTINGS ===

🏠 LISTING 691:
  City: Calvià
  Type: Apartment
  Area: 77.0 m²
  Price: €895,000.0
  🎯 Found 5 potential matches:
    1. Score: 0.400 | Municipality: SANTA MARIA DEL CAMI | Area: 77.0 m² | Ref: 000504800DD78H
    2. Score: 0.400 | Municipality: SANTA MARIA DEL CAMI | Area: 77.0 m² | Ref: 001003000DD78H
    3. Score: 0.400 | Municipality: SANTA MARIA DEL CAMI | Area: 77.0 m² | Ref: 07056A00100157
    4. Score: 0.400 | Municipality: SANTA MARIA DEL CAMI | Area: 77.0 m² | Ref: 07056A00500306
    5. Score: 0.400 | Municipality: SANTA MARIA DEL CAMI | Area: 77.0 m² | Ref: 07056A00500538

🏠 LISTING 692:
  City: Andratx
  Type: Villa
  Area: 321.0 m²
  Price: €5,900,000.0
  🎯 Found 5 potential matches:
    1. Score

In [8]:
# Match all listings (with progress tracking for large datasets)
print("🚀 Starting full dataset matching...")
print("Note: For large datasets, this may take several minutes")

all_matches = []
matched_count = 0
total_listings = len(listings_df)

# For demonstration, limit to first 50 listings to avoid long processing times
# Remove this limit for full processing: listings_df instead of listings_df.head(50)
processing_listings = listings_df.head(50)
print(f"Processing {len(processing_listings)} listings (limited for demo)")

for idx, listing in processing_listings.iterrows():
    if (idx + 1) % 10 == 0:  # Progress update every 10 listings
        print(f"  Processed {idx + 1}/{len(processing_listings)} listings...")
    
    matches = match_listing_to_cadastral(listing, cadastral_combined)
    
    if matches:
        matched_count += 1
        # Store the best match for each listing
        best_match = matches[0]
        all_matches.append({
            'listing_id': listing['listing_id'],
            'listing_city': listing['City'],
            'listing_type': listing['PropertyType'],
            'listing_area': listing['LivingAreaInternal'],
            'listing_price': listing['target_amount'],
            'match_score': best_match['total_score'],
            'cadastral_id': best_match['cadastral_id'],
            'cadastral_ref': best_match['referencia_catastral'],
            'cadastral_municipio': best_match['municipio'],
            'cadastral_area': best_match['total_living_area'],
            'geo_score': best_match['geographic'],
            'area_score': best_match['area'],
            'type_score': best_match['property_type']
        })

print(f"\n✅ Matching completed!")
print(f"📊 Results: {matched_count}/{len(processing_listings)} listings matched ({matched_count/len(processing_listings)*100:.1f}%)")

# Convert to DataFrame for analysis
matches_df = pd.DataFrame(all_matches)
if len(matches_df) > 0:
    print(f"📈 Average match score: {matches_df['match_score'].mean():.3f}")
    print(f"🎯 High-confidence matches (score > 0.7): {(matches_df['match_score'] > 0.7).sum()}")
else:
    print("❌ No matches found")


🚀 Starting full dataset matching...
Note: For large datasets, this may take several minutes
Processing 50 listings (limited for demo)
  Processed 10/50 listings...
  Processed 20/50 listings...
  Processed 30/50 listings...
  Processed 40/50 listings...
  Processed 50/50 listings...

✅ Matching completed!
📊 Results: 43/50 listings matched (86.0%)
📈 Average match score: 0.397
🎯 High-confidence matches (score > 0.7): 0


In [None]:
# Detailed analysis of results
if len(matches_df) > 0:
    print("=== DETAILED RESULTS ANALYSIS ===")
    
    # Display sample matches
    print(f"\n🔍 Sample of best matches:")
    top_matches = matches_df.nlargest(5, 'match_score')
    for idx, match in top_matches.iterrows():
        print(f"\n  Listing {match['listing_id']} → Cadastral {match['cadastral_ref']}")
        print(f"    📍 {match['listing_city']} → {match['cadastral_municipio']}")
        print(f"    📐 {match['listing_area']}m² → {match['cadastral_area']}m²")
        print(f"    🎯 Score: {match['match_score']:.3f} (geo:{match['geo_score']:.2f}, area:{match['area_score']:.2f}, type:{match['type_score']:.2f})")
    
    # Score distribution analysis
    print(f"\n📊 Score Distribution:")
    score_ranges = [
        (0.9, 1.0, "Excellent (0.9-1.0)"),
        (0.7, 0.9, "Good (0.7-0.9)"),
        (0.5, 0.7, "Fair (0.5-0.7)"),
        (0.3, 0.5, "Poor (0.3-0.5)")
    ]
    
    for min_score, max_score, label in score_ranges:
        count = ((matches_df['match_score'] >= min_score) & (matches_df['match_score'] < max_score)).sum()
        if min_score == 0.9:  # Handle the top range inclusive
            count = (matches_df['match_score'] >= min_score).sum()
        print(f"  {label}: {count} matches ({count/len(matches_df)*100:.1f}%)")
    
    # Geographic matching analysis
    print(f"\n🗺️ Geographic Matching:")
    perfect_geo = (matches_df['geo_score'] == 1.0).sum()
    partial_geo = ((matches_df['geo_score'] > 0) & (matches_df['geo_score'] < 1.0)).sum()
    print(f"  Perfect city matches: {perfect_geo} ({perfect_geo/len(matches_df)*100:.1f}%)")
    print(f"  Partial city matches: {partial_geo} ({partial_geo/len(matches_df)*100:.1f}%)")
    
    # Area matching analysis  
    print(f"\n📐 Area Matching:")
    good_area = (matches_df['area_score'] > 0.8).sum()
    fair_area = ((matches_df['area_score'] > 0.5) & (matches_df['area_score'] <= 0.8)).sum()
    print(f"  Good area matches (>80%): {good_area} ({good_area/len(matches_df)*100:.1f}%)")
    print(f"  Fair area matches (50-80%): {fair_area} ({fair_area/len(matches_df)*100:.1f}%)")
    
else:
    print("No matches to analyze")


In [None]:
# Export results to CSV
if len(matches_df) > 0:
    output_file = 'listing_cadastral_matches.csv'
    matches_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✅ Results exported to {output_file}")
    
    # Also create a summary report
    summary_stats = {
        'total_listings_processed': len(processing_listings),
        'total_matches_found': len(matches_df),
        'match_rate_percent': round(len(matches_df)/len(processing_listings)*100, 1),
        'average_match_score': round(matches_df['match_score'].mean(), 3),
        'high_confidence_matches': (matches_df['match_score'] > 0.7).sum(),
        'perfect_geo_matches': (matches_df['geo_score'] == 1.0).sum()
    }
    
    print(f"\n📋 SUMMARY STATISTICS:")
    for key, value in summary_stats.items():
        print(f"  {key.replace('_', ' ').title()}: {value}")

else:
    print("No results to export")

print(f"\n🎯 NEXT STEPS AND IMPROVEMENTS:")
print(f"1. **Scale up**: Remove the .head(50) limit to process all listings")
print(f"2. **Fine-tune weights**: Adjust geographic/area/type weights based on domain knowledge")
print(f"3. **Add more criteria**: Include bedrooms, construction year, or postal codes")
print(f"4. **Handle duplicates**: Some cadastral parcels may match multiple listings")
print(f"5. **Manual validation**: Review high-score matches for accuracy")
print(f"6. **Geographic coordinates**: Use lat/lon for distance-based matching")
print(f"7. **Address matching**: Implement street address fuzzy matching when available")
