# Cleaning rules:

An aid call is valid only if it satisfies at least one of the following criteria:
- It has valid coordinates: a valid latitude and a valid longitude
- It has a non-empty city and a non-empty location


In [34]:
import json

def validate_latitude_longitude(latitude, longitude):
    """Check that a latitude/longitude pair is numeric and geographically in range.

    Args:
        latitude: Any value accepted by ``float()``; must fall in [-90, 90].
        longitude: Any value accepted by ``float()``; must fall in [-180, 180].

    Returns:
        bool: True when both values convert to floats inside their ranges.
        ``None``, non-numeric strings, other unconvertible types and NaN
        all yield False instead of raising.
    """
    try:
        latitude = float(latitude)
        longitude = float(longitude)
    except (TypeError, ValueError):
        # float(None) raises TypeError (not ValueError); callers pass the
        # result of dict.get(), so a missing coordinate must not crash here.
        return False

    # NaN fails every comparison, so it is rejected by the range test too.
    return -90 <= latitude <= 90 and -180 <= longitude <= 180
    
def deduplicate_list(data_list):
    """Collapse aid calls that share a location, keeping one element per location.

    The first element seen for a location is kept. A later element with the
    same location replaces it only when its ``exact_position`` differs from
    the kept one AND its ``data`` dict has more keys (i.e. it carries more
    information).

    Elements whose location is missing, ``None`` or blank cannot be matched
    against anything, so each of them is kept as-is rather than being merged
    into a single "empty location" entry (which would silently drop calls
    that are still valid through their coordinates).

    Args:
        data_list: Iterable of dicts shaped like ``{"data": {"location": ...,
            "exact_position": ..., ...}}``. Missing keys are tolerated.

    Returns:
        list: The retained elements, in order of first appearance.
    """
    kept_by_key = {}

    for index, element in enumerate(data_list):
        # `or {}` also covers a "data" key that is present but null.
        element_data = element.get("data") or {}
        location = element_data.get("location")
        position = element_data.get("exact_position")

        has_no_location = location is None or (
            isinstance(location, str) and not location.strip()
        )
        # A per-element key guarantees location-less calls never collide.
        key = ("index", index) if has_no_location else ("location", location)

        if key not in kept_by_key:
            kept_by_key[key] = element
            continue

        existing_data = kept_by_key[key].get("data") or {}
        differs_in_position = existing_data.get("exact_position") != position
        if differs_in_position and len(element_data) > len(existing_data):
            kept_by_key[key] = element

    return list(kept_by_key.values())

# Load the raw aid calls. The encoding is stated explicitly so the read does
# not depend on the platform's locale default; the cleaned copy of this data
# is written back as UTF-8 with non-ASCII characters preserved.
with open('helpsV2.json', 'r', encoding='utf-8') as file:
    data = json.load(file)



In [49]:
valid_elements = []
invalid_elements = []

# First collapse duplicate aid calls, then apply the validity rules to the rest.
deduplicated_data = deduplicate_list(data)

for element in deduplicated_data:
    # `or {}` also covers keys that are present but null in the JSON.
    element_data = element.get("data") or {}
    exact_position = element_data.get("exact_position") or {}

    # Rule 1: the call carries usable coordinates.
    has_valid_coords = False
    if isinstance(exact_position, dict):
        latitude = exact_position.get("latitude")
        longitude = exact_position.get("longitude")
        # An absent coordinate can never be valid; skip the validator for it.
        if latitude is not None and longitude is not None:
            has_valid_coords = validate_latitude_longitude(latitude, longitude)

    # Rule 2: both city and location are non-blank text. Missing or null
    # fields simply fail the rule instead of raising KeyError/AttributeError.
    city = element_data.get("city")
    location = element_data.get("location")
    has_city_and_location = all(
        isinstance(value, str) and value.strip() for value in (city, location)
    )

    if has_valid_coords or has_city_and_location:
        valid_elements.append(element)
    else:
        invalid_elements.append(element)

    

In [53]:
# Destination of the cleaned aid calls.
output_file = "helpsV2cleaned.json"

print(f"Number of cleaned elements {len(valid_elements)} vs number of initial elements: {len(data)}. Saving result")

# Dump the valid elements as indented JSON; ensure_ascii=False writes
# non-ASCII characters verbatim instead of as \uXXXX escapes.
with open(output_file, mode="w", encoding="utf-8") as cleaned_handle:
    json.dump(valid_elements, cleaned_handle, indent=4, ensure_ascii=False)

Number of cleaned elements 94 vs number of initial elements: 97. Saving result


# Cleaning the CSV calls

- Normalize each location, then deduplicate on the normalized value

In [54]:
import pandas as pd
import re

# Requests exported as CSV.
df = pd.read_csv('requests.csv')

# Generic place-type words removed from free-text locations before comparison.
# Approximate meanings, in order: commune, caidat (district), douar (village),
# douars (plural), province, outskirts.
keywords = [
    'جماعة', 'قيادة', 'دوار',
    'دواوير', 'إقليم', 'نواحي',
]

def clean_location(input_string):
    """Strip a free-text location down to its distinguishing words.

    Every character that is not an ASCII letter, a character in the range
    U+0600-U+06FF, or whitespace is removed; then each entry of the
    module-level ``keywords`` list is deleted wherever it occurs (plain
    substring removal). Leading and trailing whitespace is trimmed from the
    result.

    Args:
        input_string: The raw location text.

    Returns:
        str: The cleaned text (possibly empty).
    """
    text = re.sub(r'[^a-zA-Z\u0600-\u06FF\s]', '', input_string)

    for keyword in keywords:
        text = text.replace(keyword, '')

    return text.strip()

def extract_and_normalize(location):
    """Build an order-insensitive key from a free-text location, for deduplication.

    The location is cleaned with ``clean_location``, split into words, and the
    words are sorted and joined with ``', '`` so that the same words written in
    a different order produce the same key.

    Args:
        location: The raw location value from the CSV.

    Returns:
        str | None: The normalized key (an empty string when nothing is left
        after cleaning), or None when ``location`` is not a string.
    """
    if not isinstance(location, str):
        return None

    # split() without an argument splits on any run of whitespace and never
    # yields empty tokens. split(' ') did: removing a keyword from the middle
    # of the text leaves a double space, which produced an empty part and a
    # key such as ', a, b' that no longer matched 'a, b'.
    parts = sorted(clean_location(location).split())
    return ', '.join(parts)

# Free-text column holding the commune / caidat / douar the requester belongs to.
LOCATION_COLUMN = '  لأي  جماعة / قيادة / دوار تنتمون ؟'

df['normalized_location'] = df[LOCATION_COLUMN].apply(extract_and_normalize)

# NOTE(review): read_csv is called without date parsing, so if 'Horodateur'
# holds date strings this sort is lexicographic rather than chronological --
# confirm the column format before relying on "most recent".
df = (
    df.dropna(subset=['normalized_location'])  # non-string locations yield None
      .drop(columns=[LOCATION_COLUMN])
      .sort_values(by='Horodateur', ascending=False)
)

# Keep the first (top-sorted) row of every normalized location. This replaces
# a row-by-row pd.concat loop, which was quadratic, coerced every column to
# object dtype, and produced a header-less CSV when there were no rows.
deduplicated_df = (
    df.drop_duplicates(subset='normalized_location', keep='first')
      .reset_index(drop=True)
)

deduplicated_df.to_csv('deduplicated_output.csv', index=False)
