In [19]:
import geopandas as gpd
from pathlib import Path
from ftfy import fix_text

In [31]:
test_data = gpd.read_file("/app/data/test_data.zip", encoding='utf-8')

In [None]:
test_data.head()

In [None]:
# Function to fix encoding in a DataFrame column
def fix_column_encoding(df, column):
    if df[column].dtype == 'object':  # Only process string/object columns
        df[column] = df[column].astype(str).apply(fix_text)
    return df

# Fix encoding in all string columns
for column in test_data.columns:
    if test_data[column].dtype == 'object':
        test_data = fix_column_encoding(test_data, column)

# Check a sample of the data to verify
test_data.head()

In [26]:
test_data = gpd.read_file("/app/data/test_data.zip")

# Fix all string columns with the deep fix
for column in test_data.columns:
    if test_data[column].dtype == 'object':
        test_data[column] = test_data[column].apply(deep_fix_encoding)

In [None]:
test_data.head()

In [None]:
from pyproj import Transformer

# Define coordinates in EPSG:4326 (WGS84)
wgs_coords = {
    "minx": 20.6455928891,
    "maxx": 31.5160921567,
    "miny": 59.846373196,
    "maxy": 70.1641930203,
}

# Create a transformer from EPSG:4326 to EPSG:3067
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3067", always_xy=True)

# Transform each corner point
sw = transformer.transform(wgs_coords["minx"], wgs_coords["miny"])
se = transformer.transform(wgs_coords["maxx"], wgs_coords["miny"])
ne = transformer.transform(wgs_coords["maxx"], wgs_coords["maxy"])
nw = transformer.transform(wgs_coords["minx"], wgs_coords["maxy"])

# Get the bounding box in ETRS89/TM35FIN coordinates
etrs_coords = {
    "minx": min(sw[0], nw[0]),
    "maxx": max(se[0], ne[0]),
    "miny": min(sw[1], se[1]),
    "maxy": max(nw[1], ne[1]),
}

print("EPSG:3067 coordinates (ETRS89/TM35FIN):")
print(f"minx: {etrs_coords['minx']}")
print(f"maxx: {etrs_coords['maxx']}")
print(f"miny: {etrs_coords['miny']}")
print(f"maxy: {etrs_coords['maxy']}")

EPSG:3067 coordinates (ETRS89/TM35FIN):
minx: 144286.33218675363
maxx: 752934.2155768903
miny: 6642928.395443255
maxy: 7796732.440183549


In [None]:
import geopandas as gpd
from pathlib import Path
import zipfile

# Path to the original shapefile (update as needed)
shp_path = Path('../data/test_data.zip')
gdf = gpd.read_file(shp_path)

# Columns for name and municipality (update as needed)
name_col = 'nimi'
mun_col = 'kunta'

print(f"Original data has {len(gdf)} features")

# Find all (name, municipality) combos that are unique
combo_counts = gdf.groupby([name_col, mun_col]).size().reset_index(name='count')
unique_combos = combo_counts[combo_counts['count'] == 1][[name_col, mun_col]]
non_unique_combos = combo_counts[combo_counts['count'] > 1]

print(f"\nFound {len(non_unique_combos)} non-unique name+municipality combinations:")
print("Name + Municipality combinations being removed (with count):")
for _, row in non_unique_combos.iterrows():
    print(f"  '{row[name_col]}' + '{row[mun_col]}': {row['count']} features")

# Merge to keep only unique combos
gdf_unique = gdf.merge(unique_combos, on=[name_col, mun_col], how='inner')

print(f"\nAfter removing duplicates: {len(gdf_unique)} features remaining")
print(f"Removed {len(gdf) - len(gdf_unique)} features total")

# Save to a new shapefile
out_dir = Path('../data/unique_name_mun')
out_dir.mkdir(exist_ok=True)
out_shp = out_dir / 'unique_name_mun.shp'
gdf_unique.to_file(out_shp)

# Zip the shapefile
zip_path = Path('../data/unique_name_mun.zip')
with zipfile.ZipFile(zip_path, 'w') as zf:
    for ext in ['.shp', '.shx', '.dbf', '.prj', '.cpg']:
        f = out_shp.with_suffix(ext)
        if f.exists():
            zf.write(f, f.name)

print(f"\nSaved unique name+municipality shapefile to {zip_path}")

Saved unique name+municipality shapefile to ../data/unique_name_mun.zip


  ogr_write(
