In [None]:
##Coordinate values
class GeoFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive location features from ``latitude`` / ``longitude`` columns.

    ``fit`` learns KMeans clusters over the coordinates. ``transform`` returns
    a frame with exactly three columns: ``geo_cluster`` (predicted cluster
    label), ``has_gps`` (1 when both coordinates are non-zero, else 0) and
    ``distance_to_center`` (great-circle distance in km to the reference point).

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters fitted on the coordinates.
    center_lat, center_lon : float
        Reference point in decimal degrees used for ``distance_to_center``.
    """

    def __init__(self, n_clusters=10, center_lat=-6.1630, center_lon=35.7516):
        self.n_clusters = n_clusters
        self.center_lat = center_lat
        self.center_lon = center_lon
        self.kmeans = None  # populated by fit()

    def haversine(self, lon1, lat1, lon2, lat2):
        """Return the great-circle distance in kilometres between two points.

        Inputs are decimal degrees and may be scalars, NumPy arrays or pandas
        Series (broadcast element-wise), so a whole column can be processed in
        one call instead of row by row.
        """
        import numpy as np  # local import: the notebook's import cell is not visible here

        # Convert decimal degrees to radians
        lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1], which arcsin rejects.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arcsin(np.sqrt(a))
        return 6371 * c  # Radius of Earth in km

    def fit(self, X, y=None):
        """Fit KMeans on the ``latitude`` / ``longitude`` columns of ``X``.

        ``y`` is ignored. Returns ``self``.
        """
        coords = X[['latitude', 'longitude']]
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        self.kmeans.fit(coords)
        return self

    def transform(self, X):
        """Return the three engineered geo columns for ``X``.

        Raises
        ------
        NotFittedError
            If called before ``fit``. It subclasses ``AttributeError``, which
            is what the unguarded ``None.predict`` call used to raise.
        """
        if self.kmeans is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "GeoFeatureEngineer is not fitted yet; call fit() before transform()."
            )

        X = X.copy()  # never mutate the caller's frame

        # Cluster
        X['geo_cluster'] = self.kmeans.predict(X[['latitude', 'longitude']])

        # Has GPS flag: 1 only when both coordinates are non-zero
        X['has_gps'] = ((X['latitude'] != 0) & (X['longitude'] != 0)).astype(int)

        # Distance to central point, computed for the whole column at once
        X['distance_to_center'] = self.haversine(
            self.center_lon, self.center_lat, X['longitude'], X['latitude']
        )

        return X[['geo_cluster', 'has_gps', 'distance_to_center']]

In [None]:
#preprocessing pipeline

# Custom Transformer to extract year from 'date_recorded'
class DateEncoder(BaseEstimator, TransformerMixin):
    """Swap the ``date_recorded`` column for a ``recorded_year`` column."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with ``recorded_year`` added and ``date_recorded`` removed."""
        years = pd.to_datetime(X['date_recorded']).dt.year
        with_year = X.assign(recorded_year=years)
        return with_year.drop(columns=['date_recorded'])

# Custom Transformer to drop constant or unnecessary columns
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    ``columns_to_drop`` is stored as given and handed to ``DataFrame.drop``
    at transform time.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """No fitting is required; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(columns=unwanted)

# Columns setup
binary_cols = ['permit', 'public_meeting']
low_card_cat = ['basin', 'region', 'extraction_type_class', 'payment', 'management_group', 'water_quality']
mid_card_cat = ['extraction_type', 'source', 'waterpoint_type', 'quantity_group']
high_card_cat = ['funder', 'installer']
numerical_cols = ['amount_tsh', 'gps_height', 'population', 'construction_year']

# Columns earmarked for removal.
# NOTE(review): this list is not referenced by `column_transformer` below, and
# it does not contain the GPS coordinates (those feed the 'geo' transformer).
cols_to_drop = ['scheme_name', 'wpt_name', 'recorded_by', 'subvillage', 'ward']

# Codes used for the binary flag columns; anything unrecognised becomes -1.
_BINARY_CODES = {'True': 1, 'False': 0, 'Unknown': -1}


def _encode_binary_flags(values):
    """Map 'True' / 'False' / 'Unknown' flags to 1 / 0 / -1, column by column.

    ``values`` is wrapped in a DataFrame first, so this works whether the
    preceding imputer step hands over a NumPy array or a DataFrame. Values are
    compared by their string form, so real booleans are encoded the same way
    as the strings 'True' / 'False'. Anything else maps to -1.

    A named function is used instead of a lambda so the fitted pipeline can be
    pickled.
    """
    frame = pd.DataFrame(values)
    return frame.apply(lambda col: col.astype(str).map(_BINARY_CODES).fillna(-1))


class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category by the number of times it occurred during ``fit``.

    The counts are learned once in ``fit`` and reused in ``transform``, so the
    same category receives the same value on training and unseen data.
    Categories not seen during ``fit`` are encoded as 0.
    """

    def fit(self, X, y=None):
        """Store one ``value_counts`` table per input column (by position)."""
        frame = pd.DataFrame(X)
        self.counts_ = [
            frame.iloc[:, position].value_counts()
            for position in range(frame.shape[1])
        ]
        return self

    def transform(self, X):
        """Return a frame in which every value is replaced by its fitted count."""
        frame = pd.DataFrame(X)
        encoded = [
            frame.iloc[:, position].map(counts).fillna(0)
            for position, counts in enumerate(self.counts_)
        ]
        return pd.concat(encoded, axis=1)


# Define transformers
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('to_numeric', FunctionTransformer(_encode_binary_flags))
])

low_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

mid_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

high_card_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('freq_encode', FrequencyEncoder())
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Stand-alone geo pipeline. `column_transformer` below builds its own
# GeoFeatureEngineer instance and does not reference this object.
geo_pipeline = Pipeline([
    ('geo_features', GeoFeatureEngineer(n_clusters=10))
])

# Each entry is (name, transformer, columns it is applied to).
column_transformer = ColumnTransformer(transformers=[
    ('binary', binary_transformer, binary_cols),
    ('low_card', low_card_transformer, low_card_cat),
    ('mid_card', mid_card_transformer, mid_card_cat),
    ('high_card', high_card_transformer, high_card_cat),
    ('numeric', numeric_transformer, numerical_cols),
    ('geo', GeoFeatureEngineer(n_clusters=10), ['latitude', 'longitude'])
])

In [None]:
pip install geopy

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from tqdm import tqdm

# NOTE(review): "my_app" is a generic user agent; consider a value that
# identifies this project, as geopy's Nominatim documentation asks for.
geolocator = Nominatim(user_agent="my_app")

# Throttle to one request per second and retry transient service errors.
# With swallow_exceptions=False a lookup that still fails raises, so it can be
# recorded below instead of silently becoming None.
reverse_geocode = RateLimiter(
    geolocator.reverse,
    min_delay_seconds=1,
    swallow_exceptions=False,
)


def _extract_region(location):
    """Return the first available of state_district / county / state, else None."""
    if location is None:
        return None
    address = location.raw.get('address')
    if not address:
        return None
    return address.get('state_district') or address.get('county') or address.get('state')


region_by_coords = {}  # (lat, lon) -> region, so repeated coordinates are looked up once
lookup_errors = []     # (lat, lon, exception) for every lookup that raised
regions = []           # one entry per row of df, in row order

for lat, lon in tqdm(zip(df['latitude'], df['longitude']), total=len(df)):
    coords = (lat, lon)
    if coords in region_by_coords:
        region = region_by_coords[coords]
    else:
        try:
            location = reverse_geocode(coords, language='en', timeout=10)
        except Exception as exc:
            # Best effort: keep going, but remember what failed. Failures are
            # not cached, so a later duplicate coordinate is retried.
            lookup_errors.append((lat, lon, exc))
            region = None
        else:
            region = _extract_region(location)
            region_by_coords[coords] = region
    regions.append(region)