### This is our notebook where we modified and processed the data from the teacher.

Load the CSV, set pandas display options and preview the first 10 rows.

In [3]:
import pandas as pd
import requests
import time
import json
import os

file_path = "descriptives_assembled_with_geo_part.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 


df.head(10)

Unnamed: 0.1,Unnamed: 0,Name,Title,Purpose,Contact,Category,Domicile,Founding Date,Estimated assets in CHF,Foundation supervision,Website,Phone,Email,prompt_geo,geo
0,0,fundraiso_by_category/associationsoperative_sw...,marie meierhofer institut fur das kind,the mmi is committed to ensuring that every ch...,"['www.mmi.ch', '0041 44 205 52 20', 'info@mmi....",Associations operative,marie meierhofer institut fur das kindpfingstw...,,,,www.mmi.ch,0041 44 205 52 20,info@mmi.ch,Find the latitude and longitude geographical c...,"47.3947, 8.5275\n"
1,1,fundraiso_by_category/associationsoperative_sw...,pro lumerins,pro lumerins is a cultural association,"['www.lumnezia.ch', '0041 79 508 35 33', 'prol...",Associations operative,pro lumerins uniun da culturaramun capaul pre...,,,,www.lumnezia.ch,0041 79 508 35 33,prolumerins@gmail.com,Find the latitude and longitude geographical c...,"46.6824, 9.1443\n"
2,2,fundraiso_by_category/associationsoperative_sw...,evang ref kirchgemeinde ringgenberg,evang ref kirchgemeinde ringgenberg participat...,"['www.kircheringgenberg.ch', '0041 33 822 20 5...",Associations operative,evang ref kirchgemeinde ringgenbergkirchgasse ...,,,,www.kircheringgenberg.ch,0041 33 822 20 53,andreas.schiltknecht@kircheringgenberg.ch,Find the latitude and longitude geographical c...,"46.68473, 7.89413\n"
3,3,fundraiso_by_category/associationsoperative_sw...,sos enfants de chez nous,sos enfants de chez nous pays particular atten...,"['www.sosenfantsdecheznous.ch', '0041 79 606 2...",Associations operative,sos enfants de chez nousrue de loa che 47ch195...,,,,www.sosenfantsdecheznous.ch,0041 79 606 27 07,info@sosenfantsdecheznous.ch,Find the latitude and longitude geographical c...,"46.23224,7.36284\n"
4,4,fundraiso_by_category/associationsoperative_sw...,schweizerische multiple sklerose gesellschaft,the ms society aims in particular 1 to promote...,['www.multiplesklerose.ch'],Associations operative,schweizerische multiple sklerose gesellschaftj...,,,,www.multiplesklerose.ch,,,Find the latitude and longitude geographical c...,"47.3893, 8.5294\n"
5,5,fundraiso_by_category/associationsoperative_sw...,kleika arbeitslosenprojekte verein arbeitslose...,kleika offers unemployed women within the fram...,"['www.kleika.ch', '0041 71 222 38 88', 'info@k...",Associations operative,kleika arbeitslosenprojekte verein arbeitslose...,,,,www.kleika.ch,0041 71 222 38 88,info@kleika.ch,Find the latitude and longitude geographical c...,"47.4290, 9.3748\n"
6,6,fundraiso_by_category/associationsoperative_sw...,the festival of the 5 continents,an original concept in the valais the festival...,"['5continents.ch', '0041 27 721 22 30']",Associations operative,the festival of the 5 continentsplace du manoi...,,,,5continents.ch,0041 27 721 22 30,,Find the latitude and longitude geographical c...,"46.1018, 7.0717\n"
7,7,fundraiso_by_category/associationsoperative_sw...,pro aidants,the purpose of the association is networking o...,"['www.proaidants.ch', '0041445867964', 'info@p...",Associations operative,pro aidantsco impact hub zurich agsihlquai 131...,['21.10.2011'],,,www.proaidants.ch,41445867964,info@proaidants.ch,Find the latitude and longitude geographical c...,"47.3824, 8.5367\n"
8,8,fundraiso_by_category/associationsoperative_sw...,freiplatzaktion basel,the association is committed to safeguarding t...,"['freiplatzaktion-basel.ch', '0041 61 691 11 3...",Associations operative,freiplatzaktion baselelsasserstrasse 7ch4056 b...,,,,freiplatzaktion-basel.ch,0041 61 691 11 33,infos@freiplatzaktion-basel.ch,Find the latitude and longitude geographical c...,"47.5742,7.5785\n"
9,9,fundraiso_by_category/associationsoperative_sw...,association morija,help poor populations mainly in africa and the...,"['www.morija.org', '0041 24 472 80 70', 'info@...",Associations operative,association morijacase postale 73rte industrie...,,,,www.morija.org,0041 24 472 80 70,info@morija.org,Find the latitude and longitude geographical c...,"46.3788, 6.9133\n"


Extract lat/lon from geo strings, reverse geocode with cached OpenStreetMap calls, and write city/canton plus geocoded outputs.

In [None]:
def extract_lat_lon(geo_value):
    if isinstance(geo_value, str) and ',' in geo_value:
        parts = geo_value.strip().replace('\n', '').split(',')
        if len(parts) >= 2:
            try:
                lat = float(parts[0].strip())
                lon = float(parts[1].strip())
                return lat, lon
            except ValueError:
                return None, None
    return None, None

df[['latitude', 'longitude']] = df['geo'].apply(
    lambda x: pd.Series(extract_lat_lon(x))
)

cache_file = "geo_cache.json"
if os.path.exists(cache_file):
    with open(cache_file, "r", encoding="utf-8") as f:
        cache = json.load(f)
else:
    cache = {}


def reverse_geocode(lat, lon):
    """Return city and canton names using OpenStreetMap."""
    key = f"{lat},{lon}"
    if key in cache:
        return cache[key]

    try:
        url = f"https://nominatim.openstreetmap.org/reverse?lat={lat}&lon={lon}&format=json&addressdetails=1"
        headers = {"User-Agent": "GeoCheckerOptimized/1.0"}
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            data = res.json().get("address", {})
            city = data.get("city") or data.get("town") or data.get("village")
            canton = data.get("state")
            cache[key] = (city, canton)
        else:
            cache[key] = (None, None)
    except Exception:
        cache[key] = (None, None)

    if len(cache) % 100 == 0:
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)

    time.sleep(1) 
    return cache[key]

cities, cantons = [], []
for i, row in enumerate(df.itertuples(index=False), start=1):
    lat, lon = getattr(row, "latitude"), getattr(row, "longitude")
    city, canton = reverse_geocode(lat, lon)
    cities.append(city)
    cantons.append(canton)

    if i % 500 == 0:
        df_partial = df.iloc[:i].copy()
        df_partial["City"] = cities
        df_partial["Canton"] = cantons
        df_partial.to_csv("checkpoint_geocoding.csv", index=False)
        print(f"Checkpoint saved at row {i}")

df["City"] = cities
df["Canton"] = cantons

columns_to_save = [
    "Title",
    "Purpose",
    "Contact",
    "Category",
    "Domicile",
    "Website",
    "Phone",
    "Email",
    "geo",
    "City",
    "Canton"
]
columns_to_save = [c for c in columns_to_save if c in df.columns]

df[columns_to_save].to_csv("final_geocoded_data.csv", index=False)
df[columns_to_save].to_json("final_geocoded_data.json", orient="records", indent=2, force_ascii=False)


‚úÖ Checkpoint saved at row 500
‚úÖ Checkpoint saved at row 1000
‚úÖ Checkpoint saved at row 1500
‚úÖ Checkpoint saved at row 2000
‚úÖ Checkpoint saved at row 2500
‚úÖ Checkpoint saved at row 3000
‚úÖ Checkpoint saved at row 3500
‚úÖ Checkpoint saved at row 4000
‚úÖ Checkpoint saved at row 4500
‚úÖ Checkpoint saved at row 5000
‚úÖ Checkpoint saved at row 5500
‚úÖ Checkpoint saved at row 6000
‚úÖ Checkpoint saved at row 6500
‚úÖ Checkpoint saved at row 7000
‚úÖ Checkpoint saved at row 7500
‚úÖ Checkpoint saved at row 8000
‚úÖ Checkpoint saved at row 8500
‚úÖ Checkpoint saved at row 9000
‚úÖ Checkpoint saved at row 9500
‚úÖ Checkpoint saved at row 10000
‚úÖ Checkpoint saved at row 10500
‚úÖ Checkpoint saved at row 11000
‚úÖ Checkpoint saved at row 11500
‚úÖ Checkpoint saved at row 12000
‚úÖ Checkpoint saved at row 12500
‚úÖ Checkpoint saved at row 13000
‚úÖ Checkpoint saved at row 13500
‚úÖ Checkpoint saved at row 14000
‚úÖ Checkpoint saved at row 14500
‚úÖ Checkpoint saved at row 15000


Reload the geocoded CSV and preview the first rows.

In [2]:
file_path = "final_geocoded_data.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 


df.head(10)

Unnamed: 0,Title,Purpose,Contact,Category,Domicile,Website,Phone,Email,geo,City,Canton
0,marie meierhofer institut fur das kind,the mmi is committed to ensuring that every ch...,"['www.mmi.ch', '0041 44 205 52 20', 'info@mmi....",Associations operative,marie meierhofer institut fur das kindpfingstw...,www.mmi.ch,0041 44 205 52 20,info@mmi.ch,"47.3947, 8.5275\n",Z√ºrich,Z√ºrich
1,pro lumerins,pro lumerins is a cultural association,"['www.lumnezia.ch', '0041 79 508 35 33', 'prol...",Associations operative,pro lumerins uniun da culturaramun capaul pre...,www.lumnezia.ch,0041 79 508 35 33,prolumerins@gmail.com,"46.6824, 9.1443\n",Lumnezia,Graub√ºnden/Grischun/Grigioni
2,evang ref kirchgemeinde ringgenberg,evang ref kirchgemeinde ringgenberg participat...,"['www.kircheringgenberg.ch', '0041 33 822 20 5...",Associations operative,evang ref kirchgemeinde ringgenbergkirchgasse ...,www.kircheringgenberg.ch,0041 33 822 20 53,andreas.schiltknecht@kircheringgenberg.ch,"46.68473, 7.89413\n",B√∂nigen,Bern/Berne
3,sos enfants de chez nous,sos enfants de chez nous pays particular atten...,"['www.sosenfantsdecheznous.ch', '0041 79 606 2...",Associations operative,sos enfants de chez nousrue de loa che 47ch195...,www.sosenfantsdecheznous.ch,0041 79 606 27 07,info@sosenfantsdecheznous.ch,"46.23224,7.36284\n",Sion,Valais/Wallis
4,schweizerische multiple sklerose gesellschaft,the ms society aims in particular 1 to promote...,['www.multiplesklerose.ch'],Associations operative,schweizerische multiple sklerose gesellschaftj...,www.multiplesklerose.ch,,,"47.3893, 8.5294\n",Z√ºrich,Z√ºrich
5,kleika arbeitslosenprojekte verein arbeitslose...,kleika offers unemployed women within the fram...,"['www.kleika.ch', '0041 71 222 38 88', 'info@k...",Associations operative,kleika arbeitslosenprojekte verein arbeitslose...,www.kleika.ch,0041 71 222 38 88,info@kleika.ch,"47.4290, 9.3748\n",St. Gallen,St. Gallen
6,the festival of the 5 continents,an original concept in the valais the festival...,"['5continents.ch', '0041 27 721 22 30']",Associations operative,the festival of the 5 continentsplace du manoi...,5continents.ch,0041 27 721 22 30,,"46.1018, 7.0717\n",Martigny,Valais/Wallis
7,pro aidants,the purpose of the association is networking o...,"['www.proaidants.ch', '0041445867964', 'info@p...",Associations operative,pro aidantsco impact hub zurich agsihlquai 131...,www.proaidants.ch,41445867964,info@proaidants.ch,"47.3824, 8.5367\n",Z√ºrich,Z√ºrich
8,freiplatzaktion basel,the association is committed to safeguarding t...,"['freiplatzaktion-basel.ch', '0041 61 691 11 3...",Associations operative,freiplatzaktion baselelsasserstrasse 7ch4056 b...,freiplatzaktion-basel.ch,0041 61 691 11 33,infos@freiplatzaktion-basel.ch,"47.5742,7.5785\n",Basel,Basel-Stadt
9,association morija,help poor populations mainly in africa and the...,"['www.morija.org', '0041 24 472 80 70', 'info@...",Associations operative,association morijacase postale 73rte industrie...,www.morija.org,0041 24 472 80 70,info@morija.org,"46.3788, 6.9133\n",Crebelley,Vaud


Check website availability in parallel threads, label each status, show a sample and save the results.

In [None]:
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

file_path = "final_geocoded_data.csv"
df = pd.read_csv(file_path)

def check_website(url):
    if not isinstance(url, str) or not url.strip():
        return "No URL"

    if not url.startswith(("http://", "https://")):
        url = "http://" + url.strip()
    
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
        if response.status_code == 200:
            return "Working"
        else:
            return f"Broken ({response.status_code})"
    except requests.exceptions.RequestException:
        return "Invalid"

def check_websites_parallel(urls, max_workers=50):
    """Check websites concurrently using threads."""
    results = [None] * len(urls)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {executor.submit(check_website, url): i for i, url in enumerate(urls)}
        for future in as_completed(future_to_idx):
            i = future_to_idx[future]
            try:
                results[i] = future.result()
            except Exception:
                results[i] = "Error"
    return results

df["Website_Status"] = check_websites_parallel(df["Website"].tolist(), max_workers=50)

display(df[["Title", "City", "Canton", "Website", "Website_Status"]].head(10))

df.to_csv("final_geocoded_with_website_status_fast.csv", index=False)
df.to_json("final_geocoded_with_website_status_fast.json", orient="records", indent=2)


Checking websites in parallel... (this may take a few minutes)


Unnamed: 0,Title,City,Canton,Website,Website_Status
0,marie meierhofer institut fur das kind,Z√ºrich,Z√ºrich,www.mmi.ch,Working
1,pro lumerins,Lumnezia,Graub√ºnden/Grischun/Grigioni,www.lumnezia.ch,Working
2,evang ref kirchgemeinde ringgenberg,B√∂nigen,Bern/Berne,www.kircheringgenberg.ch,Working
3,sos enfants de chez nous,Sion,Valais/Wallis,www.sosenfantsdecheznous.ch,Working
4,schweizerische multiple sklerose gesellschaft,Z√ºrich,Z√ºrich,www.multiplesklerose.ch,Working
5,kleika arbeitslosenprojekte verein arbeitslose...,St. Gallen,St. Gallen,www.kleika.ch,Working
6,the festival of the 5 continents,Martigny,Valais/Wallis,5continents.ch,Working
7,pro aidants,Z√ºrich,Z√ºrich,www.proaidants.ch,Invalid
8,freiplatzaktion basel,Basel,Basel-Stadt,freiplatzaktion-basel.ch,Working
9,association morija,Crebelley,Vaud,www.morija.org,Working



 Done! Files saved as:
 - final_geocoded_with_website_status_fast.csv
 - final_geocoded_with_website_status_fast.json


Filter to working sites, report counts removed and save the filtered data.

In [6]:

valid_mask = df["Website_Status"].str.contains("Working", na=False)
df_valid = df[valid_mask].copy()

print(f"\n{len(df_valid)} out of {len(df)} websites are working.")
print(f"{len(df) - len(df_valid)} rows removed due to broken or invalid websites.\n")

display(df_valid[["Title", "City", "Canton", "Website", "Website_Status"]].head(10))

df_valid.to_csv("final_geocoded_only_working_websites.csv", index=False)
df_valid.to_json("final_geocoded_only_working_websites.json", orient="records", indent=2)




10149 out of 19992 websites are working.
9843 rows removed due to broken or invalid websites.



Unnamed: 0,Title,City,Canton,Website,Website_Status
0,marie meierhofer institut fur das kind,Z√ºrich,Z√ºrich,www.mmi.ch,Working
1,pro lumerins,Lumnezia,Graub√ºnden/Grischun/Grigioni,www.lumnezia.ch,Working
2,evang ref kirchgemeinde ringgenberg,B√∂nigen,Bern/Berne,www.kircheringgenberg.ch,Working
3,sos enfants de chez nous,Sion,Valais/Wallis,www.sosenfantsdecheznous.ch,Working
4,schweizerische multiple sklerose gesellschaft,Z√ºrich,Z√ºrich,www.multiplesklerose.ch,Working
5,kleika arbeitslosenprojekte verein arbeitslose...,St. Gallen,St. Gallen,www.kleika.ch,Working
6,the festival of the 5 continents,Martigny,Valais/Wallis,5continents.ch,Working
8,freiplatzaktion basel,Basel,Basel-Stadt,freiplatzaktion-basel.ch,Working
9,association morija,Crebelley,Vaud,www.morija.org,Working
10,kirche in not acn schweiz,Luzern,Luzern,kirche-in-not.ch,Working


Count missing/empty Title/Website/geo fields, drop those rows, report removals and save the cleaned file.

In [7]:
cols_to_check = ["Title", "Website", "geo"]

missing_before = df_valid[cols_to_check].isna().sum().to_dict()
empty_before = {
    col: (df_valid[col].astype(str).str.strip() == "").sum()
    for col in cols_to_check
}
total_missing_before = {col: missing_before[col] + empty_before[col] for col in cols_to_check}

print("\nMissing/Empty values before cleaning:")
for col, count in total_missing_before.items():
    print(f" - {col}: {count}")

rows_before = len(df_valid)
df_valid = df_valid.dropna(subset=cols_to_check)
for col in cols_to_check:
    df_valid = df_valid[~df_valid[col].astype(str).str.strip().eq("")]

rows_after = len(df_valid)
rows_dropped = rows_before - rows_after

print(f"\nRemoved {rows_dropped} rows with missing or empty Title/Website/geo.")
print(f"Remaining rows: {rows_after}")

df_valid.to_csv("final_geocoded_cleaned.csv", index=False)
df_valid.to_json("final_geocoded_cleaned.json", orient="records", indent=2)




Missing/Empty values before cleaning:
 - Title: 1
 - Website: 0
 - geo: 0

Removed 1 rows with missing or empty Title/Website/geo.
Remaining rows: 10148


Reload the cleaned file and preview the first 10 rows.

In [4]:
file_path = "final_geocoded_cleaned.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 


df.head(10)

Unnamed: 0,Title,Purpose,Contact,Category,Domicile,Website,Phone,Email,geo,City,Canton,Website_Status
0,marie meierhofer institut fur das kind,the mmi is committed to ensuring that every ch...,"['www.mmi.ch', '0041 44 205 52 20', 'info@mmi....",Associations operative,marie meierhofer institut fur das kindpfingstw...,www.mmi.ch,0041 44 205 52 20,info@mmi.ch,"47.3947, 8.5275\n",Z√ºrich,Z√ºrich,Working
1,pro lumerins,pro lumerins is a cultural association,"['www.lumnezia.ch', '0041 79 508 35 33', 'prol...",Associations operative,pro lumerins uniun da culturaramun capaul pre...,www.lumnezia.ch,0041 79 508 35 33,prolumerins@gmail.com,"46.6824, 9.1443\n",Lumnezia,Graub√ºnden/Grischun/Grigioni,Working
2,evang ref kirchgemeinde ringgenberg,evang ref kirchgemeinde ringgenberg participat...,"['www.kircheringgenberg.ch', '0041 33 822 20 5...",Associations operative,evang ref kirchgemeinde ringgenbergkirchgasse ...,www.kircheringgenberg.ch,0041 33 822 20 53,andreas.schiltknecht@kircheringgenberg.ch,"46.68473, 7.89413\n",B√∂nigen,Bern/Berne,Working
3,sos enfants de chez nous,sos enfants de chez nous pays particular atten...,"['www.sosenfantsdecheznous.ch', '0041 79 606 2...",Associations operative,sos enfants de chez nousrue de loa che 47ch195...,www.sosenfantsdecheznous.ch,0041 79 606 27 07,info@sosenfantsdecheznous.ch,"46.23224,7.36284\n",Sion,Valais/Wallis,Working
4,schweizerische multiple sklerose gesellschaft,the ms society aims in particular 1 to promote...,['www.multiplesklerose.ch'],Associations operative,schweizerische multiple sklerose gesellschaftj...,www.multiplesklerose.ch,,,"47.3893, 8.5294\n",Z√ºrich,Z√ºrich,Working
5,kleika arbeitslosenprojekte verein arbeitslose...,kleika offers unemployed women within the fram...,"['www.kleika.ch', '0041 71 222 38 88', 'info@k...",Associations operative,kleika arbeitslosenprojekte verein arbeitslose...,www.kleika.ch,0041 71 222 38 88,info@kleika.ch,"47.4290, 9.3748\n",St. Gallen,St. Gallen,Working
6,the festival of the 5 continents,an original concept in the valais the festival...,"['5continents.ch', '0041 27 721 22 30']",Associations operative,the festival of the 5 continentsplace du manoi...,5continents.ch,0041 27 721 22 30,,"46.1018, 7.0717\n",Martigny,Valais/Wallis,Working
7,freiplatzaktion basel,the association is committed to safeguarding t...,"['freiplatzaktion-basel.ch', '0041 61 691 11 3...",Associations operative,freiplatzaktion baselelsasserstrasse 7ch4056 b...,freiplatzaktion-basel.ch,0041 61 691 11 33,infos@freiplatzaktion-basel.ch,"47.5742,7.5785\n",Basel,Basel-Stadt,Working
8,association morija,help poor populations mainly in africa and the...,"['www.morija.org', '0041 24 472 80 70', 'info@...",Associations operative,association morijacase postale 73rte industrie...,www.morija.org,0041 24 472 80 70,info@morija.org,"46.3788, 6.9133\n",Crebelley,Vaud,Working
9,kirche in not acn schweiz,the aim of the association is to carry out an ...,"['kirche-in-not.ch', '0041 41 410 46 70', 'mai...",Associations operative,kirche in not acn schweizcysatstrasse 6ch6004 ...,kirche-in-not.ch,0041 41 410 46 70,mail@kirche-in-not.ch,"47.0502, 8.3065\n",Luzern,Luzern,Working


Export a trimmed set of key columns to CSV/JSON for downstream use.

In [None]:
columns_to_save = [
    "Title",
    "Purpose",
    "Website",
    "Phone",
    "Email",
    "City",
    "Canton"
]
columns_to_save = [c for c in columns_to_save if c in df.columns]
df[columns_to_save].to_csv("final_geocoded_data.csv", index=False)
df[columns_to_save].to_json("final_geocoded_data.json", orient="records", indent=2, force_ascii=False)

Reload the trimmed dataset and show a quick preview.

In [8]:
file_path = "final_geocoded_data.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 
df.head(10)

Unnamed: 0,Title,Purpose,Website,Phone,Email,City,Canton
0,marie meierhofer institut fur das kind,the mmi is committed to ensuring that every ch...,www.mmi.ch,0041 44 205 52 20,info@mmi.ch,Z√ºrich,Z√ºrich
1,pro lumerins,pro lumerins is a cultural association,www.lumnezia.ch,0041 79 508 35 33,prolumerins@gmail.com,Lumnezia,Graub√ºnden/Grischun/Grigioni
2,evang ref kirchgemeinde ringgenberg,evang ref kirchgemeinde ringgenberg participat...,www.kircheringgenberg.ch,0041 33 822 20 53,andreas.schiltknecht@kircheringgenberg.ch,B√∂nigen,Bern/Berne
3,sos enfants de chez nous,sos enfants de chez nous pays particular atten...,www.sosenfantsdecheznous.ch,0041 79 606 27 07,info@sosenfantsdecheznous.ch,Sion,Valais/Wallis
4,schweizerische multiple sklerose gesellschaft,the ms society aims in particular 1 to promote...,www.multiplesklerose.ch,,,Z√ºrich,Z√ºrich
5,kleika arbeitslosenprojekte verein arbeitslose...,kleika offers unemployed women within the fram...,www.kleika.ch,0041 71 222 38 88,info@kleika.ch,St. Gallen,St. Gallen
6,the festival of the 5 continents,an original concept in the valais the festival...,5continents.ch,0041 27 721 22 30,,Martigny,Valais/Wallis
7,freiplatzaktion basel,the association is committed to safeguarding t...,freiplatzaktion-basel.ch,0041 61 691 11 33,infos@freiplatzaktion-basel.ch,Basel,Basel-Stadt
8,association morija,help poor populations mainly in africa and the...,www.morija.org,0041 24 472 80 70,info@morija.org,Crebelley,Vaud
9,kirche in not acn schweiz,the aim of the association is to carry out an ...,kirche-in-not.ch,0041 41 410 46 70,mail@kirche-in-not.ch,Luzern,Luzern


Classify Purpose text into categories, preview the updated data and resave the CSV/JSON.

In [None]:
import pandas as pd

file_path = "final_geocoded_data.csv"
df = pd.read_csv(file_path)

# --- keyword-based category classifier ---
def categorize_purpose(purpose):
    if not isinstance(purpose, str):
        return "Other"
    p = purpose.lower()

    if any(word in p for word in ["restaurant", "cafe", "bar", "hotel", "food", "catering", "bistro", "pub"]):
        return "Restaurant / Hospitality"

    if any(word in p for word in ["shop", "store", "retail", "commerce", "market", "boutique", "mall", "company", "enterprise"]):
        return "Commercial / Retail"

    if any(word in p for word in ["school", "university", "academy", "education", "training", "college", "institute", "kindergarten"]):
        return "Education"

    if any(word in p for word in ["hospital", "clinic", "health", "therapy", "medical", "doctor", "nursing", "pharmacy", "care", "wellness"]):
        return "Healthcare / Wellness"

    if any(word in p for word in ["foundation", "association", "charity", "ngo", "non-profit", "volunteer", "aid", "fund", "support", "social"]):
        return "Non-Profit / Charity"

    if any(word in p for word in ["event", "festival", "concert", "sport", "theater", "museum", "gallery", "club", "culture", "music"]):
        return "Event / Culture / Sports"

    if any(word in p for word in ["municipal", "government", "council", "administration", "office", "ministry", "public"]):
        return "Government / Administration"

    if any(word in p for word in ["software", "it", "tech", "digital", "engineering", "mechanical", "electrical", "industrial", "automation"]):
        return "Technology / Industry"

    if any(word in p for word in ["environment", "agriculture", "forest", "farm", "sustainability", "eco", "climate", "nature"]):
        return "Environment / Agriculture"
    return "Other"

df["Category"] = df["Purpose"].apply(categorize_purpose)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df.head(10)

df.to_csv("final_geocoded_data.csv", index=False)
df.to_json("final_geocoded_data.json", orient="records", indent=2, force_ascii=False)


Unnamed: 0,Title,Purpose,Website,Phone,Email,City,Canton,Category
0,marie meierhofer institut fur das kind,the mmi is committed to ensuring that every ch...,www.mmi.ch,0041 44 205 52 20,info@mmi.ch,Z√ºrich,Z√ºrich,Restaurant / Hospitality
1,pro lumerins,pro lumerins is a cultural association,www.lumnezia.ch,0041 79 508 35 33,prolumerins@gmail.com,Lumnezia,Graub√ºnden/Grischun/Grigioni,Non-Profit / Charity
2,evang ref kirchgemeinde ringgenberg,evang ref kirchgemeinde ringgenberg participat...,www.kircheringgenberg.ch,0041 33 822 20 53,andreas.schiltknecht@kircheringgenberg.ch,B√∂nigen,Bern/Berne,Healthcare / Wellness
3,sos enfants de chez nous,sos enfants de chez nous pays particular atten...,www.sosenfantsdecheznous.ch,0041 79 606 27 07,info@sosenfantsdecheznous.ch,Sion,Valais/Wallis,Education
4,schweizerische multiple sklerose gesellschaft,the ms society aims in particular 1 to promote...,www.multiplesklerose.ch,,,Z√ºrich,Z√ºrich,Restaurant / Hospitality
5,kleika arbeitslosenprojekte verein arbeitslose...,kleika offers unemployed women within the fram...,www.kleika.ch,0041 71 222 38 88,info@kleika.ch,St. Gallen,St. Gallen,Commercial / Retail
6,the festival of the 5 continents,an original concept in the valais the festival...,5continents.ch,0041 27 721 22 30,,Martigny,Valais/Wallis,Restaurant / Hospitality
7,freiplatzaktion basel,the association is committed to safeguarding t...,freiplatzaktion-basel.ch,0041 61 691 11 33,infos@freiplatzaktion-basel.ch,Basel,Basel-Stadt,Non-Profit / Charity
8,association morija,help poor populations mainly in africa and the...,www.morija.org,0041 24 472 80 70,info@morija.org,Crebelley,Vaud,Non-Profit / Charity
9,kirche in not acn schweiz,the aim of the association is to carry out an ...,kirche-in-not.ch,0041 41 410 46 70,mail@kirche-in-not.ch,Luzern,Luzern,Non-Profit / Charity
