In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_geonames_page(start_row):
    url = "https://www.geonames.org/search.html"
    params = {
        "q": "algiers",
        "startRow": start_row
    }

    print(f"Scraping startRow={start_row} ...")
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="restable")
    rows = table.find_all("tr")[1:]  # skip header

    places = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 6:
            continue
        name = cols[0].text.strip()
        country = cols[1].text.strip()
        feature_class = cols[2].text.strip()
        population = cols[3].text.strip().replace(",", "")
        latitude = cols[4].text.strip()
        longitude = cols[5].text.strip()

        places.append({
            "Name": name,
            "Country": country,
            "Feature Class": feature_class,
            "Population": population,
            "Latitude": latitude,
            "Longitude": longitude
        })

    return places

# Collect data from 4 pages
all_places = []
for start in [0, 50, 100, 150]:
    all_places.extend(scrape_geonames_page(start))
    time.sleep(1)  # be polite to the server

# Save to CSV
df = pd.DataFrame(all_places)
df.to_csv("geonames_algiers_0_150.csv", index=False)
print("Saved to geonames_algiers_0_150.csv")


Scraping startRow=0 ...
Scraping startRow=50 ...
Scraping startRow=100 ...
Scraping startRow=150 ...
Saved to geonames_algiers_0_150.csv


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", class_="data")
rows = table.find_all("tr")[1:]  # skip header

data = []
for row in rows:
    cols = row.find_all("td")
    if len(cols) >= 5:
        name = cols[0].text.strip()
        status = cols[1].text.strip()
        pop_1998 = cols[2].text.strip().replace(",", "")
        pop_2008 = cols[3].text.strip().replace(",", "")
        native = cols[4].text.strip()

        # Only add rows where population values are valid numbers
        if pop_1998.isdigit() and pop_2008.isdigit():
            data.append({
                "Commune": name,
                "Status": status,
                "Population 1998": int(pop_1998),
                "Population 2008": int(pop_2008),
                "Native": native
            })

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("algiers_population_by_commune.csv", index=False)
print("Saved to algiers_population_by_commune.csv")


Saved to algiers_population_by_commune.csv


In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [12]:
import pandas as pd
from fuzzywuzzy import fuzz, process
import re

def clean_place_name(name):
    """
    Clean and normalize place names for better matching
    """
    if pd.isna(name):
        return ""
    
    # Convert to string and strip whitespace
    name = str(name).strip()
    
    # Remove common prefixes/suffixes that might cause mismatches
    name = re.sub(r'\s*\[.*?\]\s*', '', name)  # Remove [Algiers] type suffixes
    name = re.sub(r'\s*$$.*?$$\s*', '', name)  # Remove (parentheses)
    
    # Normalize common variations
    name = name.replace('El ', '').replace('Al ', '').replace('el ', '').replace('al ', '')
    name = name.replace('-', ' ').replace('_', ' ')
    
    # Remove extra whitespace
    name = ' '.join(name.split())
    
    return name.lower()

def convert_dms_to_decimal(dms_str):
    """
    Convert degrees/minutes/seconds format to decimal degrees
    Example: "N 36° 43′ 56''" -> 36.732222
    """
    if pd.isna(dms_str) or not isinstance(dms_str, str):
        return None
    
    # Extract direction (N/S/E/W)
    direction = 1
    if 'S' in dms_str or 'W' in dms_str:
        direction = -1
    
    # Extract numbers using regex
    numbers = re.findall(r'\d+', dms_str)
    
    if len(numbers) >= 2:
        degrees = float(numbers[0])
        minutes = float(numbers[1])
        seconds = float(numbers[2]) if len(numbers) > 2 else 0
        
        decimal = degrees + minutes/60 + seconds/3600
        return decimal * direction
    
    return None

def extract_place_names(name_field):
    """
    Extract all possible place names from the complex name field
    """
    if pd.isna(name_field):
        return []
    
    name_str = str(name_field)
    names = []
    
    # Split by common separators
    parts = re.split(r'[,;|]', name_str)
    
    for part in parts:
        # Clean each part
        clean_part = part.strip()
        
        # Remove coordinates if embedded
        clean_part = re.sub(r'\d+\.\d+', '', clean_part)
        
        # Remove extra whitespace
        clean_part = ' '.join(clean_part.split())
        
        if clean_part and len(clean_part) > 2:
            names.append(clean_part)
    
    return names

def normalize_name(name):
    """
    Normalize place names for better matching
    """
    if pd.isna(name) or not name:
        return ""
    
    name = str(name).strip().lower()
    
    # Remove common prefixes and suffixes
    name = re.sub(r'\s*\[.*?\]\s*', '', name)
    name = re.sub(r'\s*$$.*?$$\s*', '', name)
    
    # Normalize common Arabic transliterations
    replacements = {
        'aïn': 'ain', 'aín': 'ain', 'aîn': 'ain',
        'bab': 'bab', 'baab': 'bab',
        'el ': '', 'al ': '', 'ed ': '', 'ad ': '',
        'dj': 'j', 'dz': 'z',
        'ou': 'u', 'oo': 'u',
        'kh': 'h', 'gh': 'g',
        'ï': 'i', 'î': 'i', 'í': 'i',
        'é': 'e', 'è': 'e', 'ê': 'e',
        'ç': 'c', 'ş': 's', 'ğ': 'g'
    }
    
    for old, new in replacements.items():
        name = name.replace(old, new)
    
    # Remove extra spaces and special characters
    name = re.sub(r'[^\w\s]', ' ', name)
    name = ' '.join(name.split())
    
    return name

def smart_match(pop_name, geo_names_list, threshold=60):
    """
    Smart matching using multiple strategies
    """
    if not pop_name or not geo_names_list:
        return None, 0
    
    normalized_pop = normalize_name(pop_name)
    best_match = None
    best_score = 0
    best_original = None
    
    for geo_entry in geo_names_list:
        geo_names = extract_place_names(geo_entry)
        
        for geo_name in geo_names:
            normalized_geo = normalize_name(geo_name)
            
            # Strategy 1: Exact match after normalization
            if normalized_pop == normalized_geo:
                return geo_entry, 100
            
            # Strategy 2: One name contains the other
            if normalized_pop in normalized_geo or normalized_geo in normalized_pop:
                score = 90
                if score > best_score:
                    best_score = score
                    best_match = geo_entry
                    best_original = geo_name
            
            # Strategy 3: Fuzzy matching
            ratio_score = fuzz.ratio(normalized_pop, normalized_geo)
            partial_score = fuzz.partial_ratio(normalized_pop, normalized_geo)
            token_score = fuzz.token_sort_ratio(normalized_pop, normalized_geo)
            
            # Use the highest score
            max_score = max(ratio_score, partial_score, token_score)
            
            if max_score > best_score and max_score >= threshold:
                best_score = max_score
                best_match = geo_entry
                best_original = geo_name
    
    return best_match, best_score

def merge_algiers_datasets():
    """
    Merge population data with geographic coordinates using smart matching
    """
    print("Loading datasets...")
    
    try:
        # Load population data
        pop_df = pd.read_csv('algiers_population_data.csv')
        print(f"Loaded population data: {len(pop_df)} records")
        print("Population data columns:", pop_df.columns.tolist())
        print("Sample population data:")
        print(pop_df.head())
        
        # Load geographic data
        geo_df = pd.read_csv('geonames_algiers_0_150.csv')
        print(f"\nLoaded geographic data: {len(geo_df)} records")
        print("Geographic data columns:", geo_df.columns.tolist())
        print("Sample geographic data:")
        print(geo_df.head())
        
    except FileNotFoundError as e:
        print(f"Error: Could not find file - {e}")
        print("Make sure both CSV files are in the same directory as this script")
        return None
    
    # Process geographic data to extract coordinates
    print("\nProcessing geographic coordinates...")
    
    processed_geo_data = []
    
    for idx, row in geo_df.iterrows():
        # Extract coordinates from Latitude and Longitude columns
        lat_decimal = convert_dms_to_decimal(row['Latitude'])
        lon_decimal = convert_dms_to_decimal(row['Longitude'])
        
        if lat_decimal is not None and lon_decimal is not None:
            processed_geo_data.append({
                'original_name': row['Name'],
                'latitude': lat_decimal,
                'longitude': lon_decimal,
                'all_names': extract_place_names(row['Name'])
            })
    
    print(f"Successfully processed {len(processed_geo_data)} geographic records with valid coordinates")
    
    # Show some examples of processed data
    print("\nSample processed geographic data:")
    for i, item in enumerate(processed_geo_data[:5]):
        print(f"{i+1}. Names: {item['all_names'][:3]}... -> Lat: {item['latitude']:.4f}, Lon: {item['longitude']:.4f}")
    
    # Prepare list of all geographic names for matching
    geo_names_list = [item['original_name'] for item in processed_geo_data]
    
    # Match population data with geographic data
    print(f"\nStarting smart matching process...")
    merged_data = []
    
    for idx, pop_row in pop_df.iterrows():
        place_name = pop_row['place_name']
        population_2008 = pop_row['population_2008']
        
        # Skip if no 2008 population data
        if pd.isna(population_2008):
            print(f"⚠ Skipping '{place_name}' - no 2008 population data")
            continue
        
        # Find best matching geographic location
        best_match, score = smart_match(place_name, geo_names_list)
        
        if best_match and score >= 60:
            # Find the corresponding processed geo data
            geo_item = next((item for item in processed_geo_data if item['original_name'] == best_match), None)
            
            if geo_item:
                merged_data.append({
                    'place_name': place_name,
                    'longitude': geo_item['longitude'],
                    'latitude': geo_item['latitude'],
                    'population_2008': int(population_2008),
                    'match_score': score,
                    'matched_with': best_match
                })
                
                print(f"✓ Matched '{place_name}' with '{best_match}' (score: {score})")
            else:
                print(f"✗ Found match but couldn't locate geo data for '{place_name}'")
        else:
            print(f"✗ No good match found for '{place_name}' (best score: {score})")
    
    if not merged_data:
        print("No matches found between datasets!")
        return None
    
    # Create final DataFrame
    final_df = pd.DataFrame(merged_data)
    
    # Sort by population (descending)
    final_df = final_df.sort_values('population_2008', ascending=False)
    
    # Create the final dataset with only requested columns
    result_df = final_df[['place_name', 'longitude', 'latitude', 'population_2008']].copy()
    
    # Save to CSV
    output_filename = 'algiers_combined_dataset.csv'
    result_df.to_csv(output_filename, index=False)
    
    print(f"\n" + "="*60)
    print("DATASET MERGING COMPLETED!")
    print("="*60)
    print(f"Successfully merged {len(result_df)} places")
    print(f"Output saved to: {output_filename}")
    
    print(f"\nFinal dataset preview:")
    print(result_df.head(10))
    
    print(f"\nDataset statistics:")
    print(f"Total places: {len(result_df)}")
    print(f"Average population: {result_df['population_2008'].mean():.0f}")
    if len(result_df) > 0:
        print(f"Largest city: {result_df.iloc[0]['place_name']} ({result_df.iloc[0]['population_2008']:,})")
        print(f"Smallest city: {result_df.iloc[-1]['place_name']} ({result_df.iloc[-1]['population_2008']:,})")
    
    # Show matching details
    print(f"\nMatching details:")
    match_details = final_df[['place_name', 'matched_with', 'match_score']].head(10)
    print(match_details)
    
    return result_df

# Run the merger
if __name__ == "__main__":
    result = merge_algiers_datasets()
    
    if result is not None:
        print(f"\n🎉 Success! Combined dataset created with {len(result)} records")
        print("File: algiers_combined_dataset.csv")
    else:
        print("❌ Failed to create combined dataset")


Loading datasets...
Loaded population data: 58 records
Population data columns: ['place_name', 'status', 'native_name', 'population_1998', 'population_2008']
Sample population data:
        place_name   status  native_name  population_1998  population_2008
0       Aïn Benian  Commune  عين البنيان            52343            68354
1         Aïn Taya  Commune     عين طاية            29515            34501
2      Baba Hassen  Commune     بابا حسن            13827            23756
3      Bab El Oued  Commune   باب الوادي            87557            64732
4      Bab Ezzouar  Commune   باب الزوار            92157            96597

Loaded geographic data: 200 records
Geographic data columns: ['Name', 'Country', 'Feature Class', 'Population', 'Latitude', 'Longitude']
Sample geographic data:
   Name                                            Country  \
0     1  Algiers  ALG,Al Jazair,Al-jezair,Alcher,Alge,A...   
1     2  Bab Ezzouar  Bab Ezzouar,Le Retour de la Chass...   
2     3  Algiers Air

In [17]:
import pandas as pd
import re

def clean_name(name):
    """Clean name by removing special characters and converting to lowercase"""
    if pd.isna(name):
        return ""
    
    # Convert to string and lowercase
    name = str(name).lower()
    
    # Remove special characters and keep only letters and spaces
    name = re.sub(r'[^a-zA-Z\s]', '', name)
    
    # Remove extra spaces
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name

def has_4_successive_chars(name1, name2):
    """Check if two names share 4 successive characters"""
    if not name1 or not name2:
        return False
    
    name1_clean = clean_name(name1).replace(' ', '')  # Remove spaces for character matching
    name2_clean = clean_name(name2).replace(' ', '')
    
    # Check all 4-character substrings in name1
    for i in range(len(name1_clean) - 3):
        substring = name1_clean[i:i+4]
        if substring in name2_clean:
            print(f"  Found matching substring '{substring}' in '{name1}' and '{name2}'")
            return True
    
    return False

def extract_population(pop_str):
    """Extract population numbers from string"""
    if pd.isna(pop_str):
        return None
    
    try:
        # Remove asterisks and convert to int
        pop_str = str(pop_str).replace('*', '').replace(',', '')
        return int(pop_str) if pop_str.isdigit() else None
    except:
        return None

def parse_coordinates(coord_str):
    """Parse coordinate strings to decimal degrees"""
    if pd.isna(coord_str):
        return None
    
    try:
        coord_str = str(coord_str).strip()
        
        # Check if already decimal
        if '°' not in coord_str:
            return float(coord_str)
        
        # Parse degree format
        direction = 1
        if coord_str.startswith('S') or coord_str.startswith('W'):
            direction = -1
        
        # Extract numbers
        numbers = re.findall(r'\d+(?:\.\d+)?', coord_str)
        
        if len(numbers) >= 1:
            degrees = float(numbers[0])
            minutes = float(numbers[1]) if len(numbers) > 1 else 0
            seconds = float(numbers[2]) if len(numbers) > 2 else 0
            
            decimal = direction * (degrees + minutes/60 + seconds/3600)
            return decimal
    except:
        return None
    
    return None

def load_population_data(filepath):
    """Load and clean population dataset"""
    try:
        df = pd.read_csv(filepath)
        
        # Clean column names
        df.columns = df.columns.str.strip()
        
        # Process population columns
        if '*population_1998' in df.columns:
            df['population_1998'] = df['*population_1998'].apply(extract_population)
        
        if '*population_2008' in df.columns:
            df['population_2008'] = df['*population_2008'].apply(extract_population)
        
        return df
    except Exception as e:
        print(f"Error loading population data: {e}")
        return None

def load_geonames_data(filepath):
    """Load and clean geonames dataset"""
    try:
        df = pd.read_csv(filepath)
        
        # Clean column names
        df.columns = df.columns.str.strip()
        
        # Parse coordinates
        if '*Latitude' in df.columns:
            df['latitude'] = df['*Latitude'].apply(parse_coordinates)
        
        if '*Longitude*' in df.columns:
            df['longitude'] = df['*Longitude*'].apply(parse_coordinates)
        
        # Extract population from geonames
        if '*Population' in df.columns:
            df['geonames_population'] = df['*Population'].apply(extract_population)
        
        # Use the entire Name field for matching (contains all variations)
        df['main_name'] = df['Name'].astype(str)
        
        return df
    except Exception as e:
        print(f"Error loading geonames data: {e}")
        return None

def merge_datasets_simple(pop_filepath, geo_filepath, output_filepath=None):
    """Merge datasets using simple 4-character matching"""
    
    print("Loading population data...")
    pop_df = load_population_data(pop_filepath)
    if pop_df is None:
        return None
    
    print("Loading geonames data...")
    geo_df = load_geonames_data(geo_filepath)
    if geo_df is None:
        return None
    
    print(f"Population dataset: {len(pop_df)} records")
    print(f"Geonames dataset: {len(geo_df)} records")
    
    print("Matching places based on 4 successive characters...")
    
    merged_data = []
    matches_found = 0
    
    # For each place in population dataset
    for pop_idx, pop_row in pop_df.iterrows():
        place_name = pop_row['place_name']
        match_found = False
        
        # Check against all names in geonames dataset
        for geo_idx, geo_row in geo_df.iterrows():
            geo_name = geo_row['main_name']
            
            print(f"Checking: '{place_name}' against '{geo_name[:50]}...'")  # Show first 50 chars
            
            # Check if they share 4 successive characters
            if has_4_successive_chars(place_name, geo_name):
                # Create merged record
                merged_record = {
                    'place_name': place_name,
                    'matched_geoname': geo_name,
                    'status': pop_row.get('status', ''),
                    'native_name': pop_row.get('native_name', ''),
                    'population_1998': pop_row.get('population_1998', None),
                    'population_2008': pop_row.get('population_2008', None),
                    'geonames_population': geo_row.get('geonames_population', None),
                    'latitude': geo_row.get('latitude', None),
                    'longitude': geo_row.get('longitude', None)
                }
                merged_data.append(merged_record)
                match_found = True
                matches_found += 1
                print(f"Match found: '{place_name}' <-> '{geo_name}'")
                break  # Take first match only
        
        # If no match found, add record without coordinates
        if not match_found:
            merged_record = {
                'place_name': place_name,
                'matched_geoname': '',
                'status': pop_row.get('status', ''),
                'native_name': pop_row.get('native_name', ''),
                'population_1998': pop_row.get('population_1998', None),
                'population_2008': pop_row.get('population_2008', None),
                'geonames_population': None,
                'latitude': None,
                'longitude': None
            }
            merged_data.append(merged_record)
    
    # Create final dataframe
    merged_df = pd.DataFrame(merged_data)
    
    print(f"\nMatching complete!")
    print(f"Total records in merged dataset: {len(merged_df)}")
    print(f"Records with matches (coordinates): {matches_found}")
    print(f"Records without matches: {len(merged_df) - matches_found}")
    
    # Save to file if output path provided
    if output_filepath:
        merged_df.to_csv(output_filepath, index=False)
        print(f"Merged dataset saved to: {output_filepath}")
    
    return merged_df

# Usage
if __name__ == "__main__":
    # File paths
    population_file = r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\algiers_population_data.csv"
    geonames_file = r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\geonames_algiers_0_150.csv"
    output_file = r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\merged_algiers_data.csv"
    
    # Merge datasets
    merged_dataset = merge_datasets_simple(
        pop_filepath=population_file,
        geo_filepath=geonames_file,
        output_filepath=output_file
    )
    
    if merged_dataset is not None:
        print("\nSample of merged data:")
        print(merged_dataset[['place_name', 'matched_geoname', 'latitude', 'longitude', 'population_2008']].head(10))
        
        print("\nFinal dataset summary:")
        print(f"Total records: {len(merged_dataset)}")
        print(f"Records with coordinates: {merged_dataset['latitude'].notna().sum()}")
        print(f"Records with 2008 population: {merged_dataset['population_2008'].notna().sum()}")

Loading population data...
Loading geonames data...
Population dataset: 58 records
Geonames dataset: 200 records
Matching places based on 4 successive characters...
Checking: 'Aïn Benian' against '1...'
Checking: 'Aïn Benian' against '2...'
Checking: 'Aïn Benian' against '3...'
Checking: 'Aïn Benian' against '4...'
Checking: 'Aïn Benian' against '5...'
Checking: 'Aïn Benian' against '6...'
Checking: 'Aïn Benian' against '7...'
Checking: 'Aïn Benian' against '8...'
Checking: 'Aïn Benian' against '9...'
Checking: 'Aïn Benian' against '10...'
Checking: 'Aïn Benian' against '11...'
Checking: 'Aïn Benian' against '12...'
Checking: 'Aïn Benian' against '13...'
Checking: 'Aïn Benian' against '14...'
Checking: 'Aïn Benian' against '15...'
Checking: 'Aïn Benian' against '16...'
Checking: 'Aïn Benian' against '17...'
Checking: 'Aïn Benian' against '18...'
Checking: 'Aïn Benian' against '19...'
Checking: 'Aïn Benian' against '20...'
Checking: 'Aïn Benian' against '21...'
Checking: 'Aïn Benian' ag

In [18]:
import pandas as pd


In [20]:
df = pd.read_csv(r'C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\points.csv')


In [21]:
df

Unnamed: 0,@id,name,name:ar,network,network:wikidata,operator,operator:wikidata,public_transport,railway,tram,...,lat,alt_name:ft,alt_name:ar,alt_name:fr,name:fr,name:de,name:en,source,name:it,shelter
0,node/452418787,Cité Mokhtar Zerhouni,حي مختار زرهوني,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.729478,,,,,,,,,
1,node/472084248,Cité Universitaire - CUB 1,الإقامة الجامعية - ح ج ب 1,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.732251,RUBE 1,,,,,,,,
2,node/472084262,Bab Ezzouar - Le Pont باب الزوار - الجسر,باب الزوار - الجسر,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.724346,,,,,,,,,
3,node/472084274,Université de Bab Ezzouar,جامعة باب الزوار,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.720810,,جامعة العلوم و التكنولوجيا هواري بومدين,Université USTHB,,,,,,
4,node/472084279,Cité Rabia,حي رابية,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.724481,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,node/4377305015,Dergana Centre,درڨانة وسط,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.772017,,,,Dergana Centre,,Dergana Center,local knowledge,,yes
72,node/4377305016,Faculté Biomédicale,,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.777736,,,,,,,local knowledge,,
73,node/7301592854,Ruisseau,الرويسو,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.742919,,رويسو,,,,,,,
74,node/8292893940,Ben Redouan,بن رضوان,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,36.755438,,,,Ben Redouan,,Ben Redouan,,,


In [22]:
dataset = df[['name', 'lon', 'lat']].rename(columns={'lon': 'longitude', 'lat': 'latitude'})


In [23]:
print(dataset.head())

                                       name  longitude   latitude
0                     Cité Mokhtar Zerhouni   3.174324  36.729478
1                Cité Universitaire - CUB 1   3.184065  36.732251
2  Bab Ezzouar - Le Pont باب الزوار - الجسر   3.184784  36.724346
3                 Université de Bab Ezzouar   3.179575  36.720810
4                                Cité Rabia   3.176970  36.724481


In [26]:
dataset.to_csv(r'C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\tram_points.csv', index=False)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_algiers_population():
    """
    Scrape population data for Algiers communes from citypopulation.de
    """
    url = "https://www.citypopulation.de/en/algeria/admin/16__el_djaza%C3%AFr/"
    
    print("Fetching data from citypopulation.de...")
    
    # Send GET request with headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all tables with class 'data'
        tables = soup.find_all('table', {'class': 'data'})
        
        if not tables:
            print("Could not find any data tables on the page")
            return None
        
        # The main table with commune data is usually the largest one
        # Let's find all rows across all tables
        all_data = []
        
        for table in tables:
            # Find all rows in the table
            rows = table.find_all('tr')
            print(f"Found {len(rows)} rows in a table")
            
            # Skip header rows - we'll process each row and check if it has commune data
            for row in rows:
                # Check if this is a data row (not a header)
                cells = row.find_all(['td', 'th'])
                
                if len(cells) >= 5:  # We need at least 5 columns (name, status, native, pop1998, pop2008)
                    # Skip if this is a header row
                    if row.find('th'):
                        continue
                    
                    # Extract place name (first cell)
                    name_cell = cells[0]
                    
                    # Skip the province row (we want communes)
                    cell_text = name_cell.get_text().strip()
                    if "Province" in str(row):
                        continue
                        
                    # Get place name
                    place_name = None
                    name_link = name_cell.find('a')
                    if name_link:
                        place_name = name_link.get_text().strip()
                    else:
                        place_name = cell_text
                    
                    # Skip empty rows or header rows
                    if not place_name or place_name == "Name":
                        continue
                    
                    # Extract status (usually "Commune")
                    status = ""
                    if len(cells) > 1:
                        status = cells[1].get_text().strip()
                    
                    # Extract native name
                    native_name = ""
                    if len(cells) > 2:
                        native_name = cells[2].get_text().strip()
                    
                    # Extract population data
                    population_1998 = None
                    population_2008 = None
                    
                    # Population 1998 (usually 4th column)
                    if len(cells) > 3:
                        pop_text = cells[3].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_1998 = int(clean_number)
                    
                    # Population 2008 (usually 5th column)
                    if len(cells) > 4:
                        pop_text = cells[4].get_text().strip()
                        numbers = re.findall(r'[\d,]+', pop_text)
                        if numbers:
                            clean_number = numbers[0].replace(',', '')
                            if clean_number.isdigit():
                                population_2008 = int(clean_number)
                    
                    # Add to our data list
                    all_data.append({
                        'place_name': place_name,
                        'status': status,
                        'native_name': native_name,
                        'population_1998': population_1998,
                        'population_2008': population_2008
                    })
        
        print(f"Successfully extracted data for {len(all_data)} places")
        
        # Create DataFrame
        df = pd.DataFrame(all_data)
        
        # Save to CSV
        csv_filename = 'algiers_population_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        print(f"Data saved to {csv_filename}")
        print("\nFirst few rows of the dataset:")
        print(df.head(10))
        
        print(f"\nDataset summary:")
        print(f"Total places: {len(df)}")
        print(f"Places with 1998 data: {df['population_1998'].notna().sum()}")
        print(f"Places with 2008 data: {df['population_2008'].notna().sum()}")
        
        return df
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the scraper
if __name__ == "__main__":
    df = scrape_algiers_population()
    
    if df is not None:
        print("\n" + "="*50)
        print("SCRAPING COMPLETED SUCCESSFULLY!")
        print("="*50)
        print(f"CSV file 'algiers_population_data.csv' has been created with {len(df)} records")
    else:
        print("Scraping failed. Please check the error messages above.")


Fetching data from citypopulation.de...
Found 60 rows in a table
Found 3 rows in a table
Found 4 rows in a table
Found 10 rows in a table
Found 4 rows in a table
Successfully extracted data for 58 places
Data saved to algiers_population_data.csv

First few rows of the dataset:
     place_name   status  native_name  population_1998  population_2008
0    Aïn Benian  Commune  عين البنيان            52343            68354
1      Aïn Taya  Commune     عين طاية            29515            34501
2   Baba Hassen  Commune     بابا حسن            13827            23756
3   Bab El Oued  Commune   باب الوادي            87557            64732
4   Bab Ezzouar  Commune   باب الزوار            92157            96597
5   Bachdjerrah  Commune     باش جراح            90073            93289
6        Baraki  Commune        براقي            95247           116375
7    Ben Aknoun  Commune     بن عكنون            19404            18838
8  Beni Messous  Commune     بني مسوس            17490            36191
9 

In [30]:
final = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\clean\final_population_and_credentials_dataset.csv")
final.head()

Unnamed: 0,commune_name,population,Latitude,Longitude
0,Ain Benian,68354,N 36° 48′ 10'',E 2° 55′ 18''
1,Ain Taya,34501,N 36° 47′ 36'',E 3° 17′ 13''
2,Baba Hassen,23756,N 36° 41′ 44'',E 2° 58′ 21''
3,Bab El Oued,64732,N 36° 47′ 53'',E 3° 1′ 27''
4,Bab Ezzouar,96597,N 36° 43′ 34'',E 3° 10′ 58''


In [31]:
import pandas as pd
import re

def dms_to_decimal(dms_string):
    """
    Convert degrees, minutes, seconds to decimal degrees
    Example: "N 36° 48′ 10''" -> 36.8027777778
    """
    # Extract direction (N/S/E/W)
    direction = dms_string.strip()[0]
    
    # Extract numbers using regex
    numbers = re.findall(r'\d+', dms_string)
    
    if len(numbers) != 3:
        return None
    
    degrees = int(numbers[0])
    minutes = int(numbers[1])
    seconds = int(numbers[2])
    
    # Convert to decimal
    decimal = degrees + minutes/60 + seconds/3600
    
    # Apply direction (negative for South and West)
    if direction in ['S', 'W']:
        decimal = -decimal
    
    return decimal

# Read the dataset
final = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\clean\final_population_and_credentials_dataset.csv")

# Convert coordinates
final['Latitude_Decimal'] = final['Latitude'].apply(dms_to_decimal)
final['Longitude_Decimal'] = final['Longitude'].apply(dms_to_decimal)

# Reorder columns to match your desired format
final_converted = final[['commune_name', 'Longitude_Decimal', 'Latitude_Decimal', 'population']].copy()

# Rename columns to match your example
final_converted.columns = ['commune_name', 'longitude', 'latitude', 'population']

# Display the converted data
print("Original format:")
print(final[['commune_name', 'population', 'Latitude', 'Longitude']].head())
print("\nConverted format:")
print(final_converted.head())

# Save the converted dataset
output_path = r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\clean\final_population_and_credentials_dataset_converted.csv"
final_converted.to_csv(output_path, index=False)
print(f"\nConverted dataset saved to: {output_path}")

# Verify conversion with first row as example
first_row = final.iloc[0]
print(f"\nVerification - First row conversion:")
print(f"Original: {first_row['commune_name']} - Lat: {first_row['Latitude']}, Lon: {first_row['Longitude']}")
print(f"Converted: {final_converted.iloc[0]['commune_name']} - Lat: {final_converted.iloc[0]['latitude']}, Lon: {final_converted.iloc[0]['longitude']}")

Original format:
  commune_name  population        Latitude      Longitude
0   Ain Benian       68354  N 36° 48′ 10''  E 2° 55′ 18''
1     Ain Taya       34501  N 36° 47′ 36''  E 3° 17′ 13''
2  Baba Hassen       23756  N 36° 41′ 44''  E 2° 58′ 21''
3  Bab El Oued       64732  N 36° 47′ 53''   E 3° 1′ 27''
4  Bab Ezzouar       96597  N 36° 43′ 34''  E 3° 10′ 58''

Converted format:
  commune_name  longitude   latitude  population
0   Ain Benian   2.921667  36.802778       68354
1     Ain Taya   3.286944  36.793333       34501
2  Baba Hassen   2.972500  36.695556       23756
3  Bab El Oued   3.024167  36.798056       64732
4  Bab Ezzouar   3.182778  36.726111       96597

Converted dataset saved to: C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\clean\final_population_and_credentials_dataset_converted.csv

Verification - First row conversion:
Original: Ain Benian - Lat: N 36° 48′ 10'', Lon: E 2° 55′ 18''
Converted: Ain Benian - Lat: 36.80277777777778, Lon: 2.92166666666

In [33]:
finall = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\study_points.csv")
finall.head()

Unnamed: 0,@id,name,name:ar,network,network:wikidata,operator,operator:wikidata,public_transport,railway,tram,...,old_short_name,addr:city:ar,name:es,facebook,alt_name:en,contact:email,contact:website,material,name:fa,short_name:fr
0,node/452418787,Cité Mokhtar Zerhouni,حي مختار زرهوني,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,,,,,,,,,,
1,node/472084248,Cité Universitaire - CUB 1,الإقامة الجامعية - ح ج ب 1,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,,,,,,,,,,
2,node/472084262,Bab Ezzouar - Le Pont باب الزوار - الجسر,باب الزوار - الجسر,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,,,,,,,,,,
3,node/472084274,Université de Bab Ezzouar,جامعة باب الزوار,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,,,,,,,,,,
4,node/472084279,Cité Rabia,حي رابية,ترامواي الجزائر,Q1688502,SETRAM,Q22688489,stop_position,tram_stop,yes,...,,,,,,,,,,


In [38]:
import pandas as pd

# Load your CSV
finall = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\study_points.csv")

# Keep only the desired columns
cleaned = finall[['name', 'lon', 'lat']].dropna()

cleaned.rename(columns={'name': 'commune_name', 'lon': 'longitude', 'lat': 'latitude'}, inplace=True)

# Save cleaned data to a new file (optional)
cleaned.to_csv(r"C:\Users\user\OneDrive\Desktop\ibtikar_backend-main\Genetic_Algo\clean\cleaned_study_points.csv", index=False)

# Preview
print(cleaned.head())


                               commune_name  longitude   latitude
0                     Cité Mokhtar Zerhouni   3.174324  36.729478
1                Cité Universitaire - CUB 1   3.184065  36.732251
2  Bab Ezzouar - Le Pont باب الزوار - الجسر   3.184784  36.724346
3                 Université de Bab Ezzouar   3.179575  36.720810
4                                Cité Rabia   3.176970  36.724481
