In [19]:
import warnings

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error


In [20]:
# Read the enriched, unified dataset produced by the processing step.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/ethiopia_fi_unified_data_enriched.csv"
)

# (rows, columns) as a quick check that the load returned what we expect.
df.shape


(43, 34)

In [21]:
# Convert observation dates to datetimes; unparseable entries become NaT
# instead of raising.
df["observation_date"] = pd.to_datetime(df["observation_date"], errors="coerce")

# Calendar year of each observation (NaN wherever the date is NaT).
df["year"] = df["observation_date"].dt.year

# Retain only rows that are observations and carry a numeric value.
is_observation = df["record_type"].eq("observation")
has_numeric_value = df["value_numeric"].notna()
num_df = df.loc[is_observation & has_numeric_value].copy()

num_df[["indicator", "year", "value_numeric"]].head()


Unnamed: 0,indicator,year,value_numeric
0,Account Ownership Rate,2014,22.0
1,Account Ownership Rate,2017,35.0
2,Account Ownership Rate,2021,46.0
3,Account Ownership Rate,2021,56.0
4,Account Ownership Rate,2021,36.0


In [22]:
# Indicators kept for the driver analysis: the account-ownership series and
# the two candidate driver series.
driver_indicators = [
    "Account Ownership Rate",
    "4G Population Coverage",
    "Mobile Money Account Rate"
]

# Long-format subset containing only rows for those indicators.
is_driver_row = num_df["indicator"].isin(driver_indicators)
driver_df = num_df.loc[is_driver_row].copy()

driver_df[["indicator", "year", "value_numeric"]]


Unnamed: 0,indicator,year,value_numeric
0,Account Ownership Rate,2014,22.0
1,Account Ownership Rate,2017,35.0
2,Account Ownership Rate,2021,46.0
3,Account Ownership Rate,2021,56.0
4,Account Ownership Rate,2021,36.0
5,Account Ownership Rate,2024,49.0
6,Mobile Money Account Rate,2021,4.7
7,Mobile Money Account Rate,2024,9.45
8,4G Population Coverage,2023,37.5
9,4G Population Coverage,2025,70.8


In [23]:
# Reshape to one row per year and one column per indicator. When a year has
# several values for the same indicator they are collapsed to their mean.
# NOTE(review): 2021 has three Account Ownership Rate values (46, 56, 36) that
# get averaged here; presumably different breakdowns or sources. Confirm that
# averaging them is the intended treatment.
wide_df = pd.pivot_table(
    driver_df,
    values="value_numeric",
    index="year",
    columns="indicator",
    aggfunc="mean",
).reset_index()

wide_df


indicator,year,4G Population Coverage,Account Ownership Rate,Mobile Money Account Rate
0,2014,,22.0,
1,2017,,35.0,
2,2021,,46.0,4.7
3,2023,37.5,,
4,2024,,49.0,9.45
5,2025,70.8,,


In [24]:
# Sanity-check the pivoted table built in the previous cell instead of
# recomputing it: the pivot must yield at least one row and exactly one row
# per year. These checks do not modify wide_df, so the cell is safe to re-run.
assert not wide_df.empty, "pivot produced no rows"
assert wide_df["year"].is_unique, "expected exactly one row per year after pivoting"

wide_df


indicator,year,4G Population Coverage,Account Ownership Rate,Mobile Money Account Rate
0,2014,,22.0,
1,2017,,35.0,
2,2021,,46.0,4.7
3,2023,37.5,,
4,2024,,49.0,9.45
5,2025,70.8,,


In [25]:
# Short snake_case aliases for the indicator columns used in modelling.
column_aliases = {
    "4G Population Coverage": "fourg_coverage",
    "Mobile Money Account Rate": "mobile_money_rate",
    "Account Ownership Rate": "account_ownership",
}
wide_df = wide_df.rename(columns=column_aliases)

wide_df


indicator,year,fourg_coverage,account_ownership,mobile_money_rate
0,2014,,22.0,
1,2017,,35.0,
2,2021,,46.0,4.7
3,2023,37.5,,
4,2024,,49.0,9.45
5,2025,70.8,,


In [26]:
# Order rows chronologically so the forward fill propagates values through time.
wide_df = wide_df.sort_values(by="year")

# Carry each column's last observed value forward into later years. This
# imputes every column, including account_ownership (the regression target):
# in the displayed result the 2023 value of 46.0 is the 2021 figure carried forward.
wide_df_ffill = wide_df.ffill()

# Keep only years where all three series have a value after filling.
# Note: this rebinds driver_df from the long-format subset to the wide
# modelling table.
model_columns = ["year", "account_ownership", "fourg_coverage", "mobile_money_rate"]
driver_df = wide_df_ffill.loc[:, model_columns].dropna(how="any")

driver_df


indicator,year,account_ownership,fourg_coverage,mobile_money_rate
3,2023,46.0,37.5,4.7
4,2024,49.0,37.5,9.45
5,2025,49.0,70.8,9.45


In [27]:
# Design matrix (the two driver columns) and response (account ownership).
feature_columns = ["fourg_coverage", "mobile_money_rate"]
X = driver_df.loc[:, feature_columns]
y = driver_df.loc[:, "account_ownership"]

X, y


(indicator  fourg_coverage  mobile_money_rate
 3                    37.5               4.70
 4                    37.5               9.45
 5                    70.8               9.45,
 3    46.0
 4    49.0
 5    49.0
 Name: account_ownership, dtype: float64)

In [28]:
# Fit an ordinary least-squares model of account ownership on the two drivers.
# LinearRegression comes from the notebook's top import cell; it is not
# re-imported here.
driver_model = LinearRegression()
driver_model.fit(X, y)

# LinearRegression fits an intercept by default, so the model has
# n_features + 1 parameters. With no more rows than parameters there are no
# residual degrees of freedom: the fit can reproduce the training rows exactly,
# and neither the coefficients nor in-sample R^2 / MAE carry statistical evidence.
residual_dof = len(X) - (X.shape[1] + 1)
if residual_dof <= 0:
    warnings.warn(
        f"Saturated fit: {len(X)} rows for {X.shape[1] + 1} parameters "
        f"(residual degrees of freedom = {residual_dof}). "
        "Coefficients and in-sample metrics are not statistically meaningful."
    )

# One row per driver with its fitted slope.
coefficients = pd.DataFrame({
    "driver": X.columns,
    "coefficient": driver_model.coef_
})

intercept = driver_model.intercept_

coefficients, intercept


(              driver   coefficient
 0     fourg_coverage  1.375169e-17
 1  mobile_money_rate  6.315789e-01,
 np.float64(43.03157894736842))

In [29]:
# In-sample evaluation: the model is scored on the same rows it was fitted to,
# so these numbers describe goodness of fit only, not predictive accuracy.
# r2_score and mean_absolute_error come from the notebook's top import cell.
y_pred = driver_model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)

r2, mae


(1.0, 0.0)

## Interpretation of Results

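- **4G Population Coverage** has a fitted coefficient of effectively zero
  (≈ 1.4e-17), so this model shows no association between 4G coverage and
  account ownership. Whether infrastructure expansion supports financial
  inclusion cannot be assessed from this fit.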

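- **Mobile Money Account Rate** has the larger coefficient (≈ 0.63), which is
  consistent with adoption of mobile financial services being a key driver of
  inclusion. Note that this slope is determined by a single change between two
  rows (2023 → 2024), so it is indicative only.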

### Caveats
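- Small sample size: only three rows remain after forward-filling, and a model
  with two predictors plus an intercept fits three rows exactly, so
  R² = 1.0 and MAE = 0.0 are artifacts of the fit rather than evidence of accuracy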
- Observational data
- Results indicate correlation, not causation


In [31]:
# Persist the fitted driver coefficients (without the row index) for later tasks.
coefficients.to_csv(
    path_or_buf="../outputs/task3_driver_coefficients.csv",
    index=False,
)


In [6]:
# Inspect the column names of the forward-filled wide table.
# NOTE(review): this cell's execution count (6) is out of sequence with the
# cells around it; confirm the notebook still runs cleanly top to bottom.
list(wide_df_ffill.columns)


['year', 'account_ownership', 'fourg_coverage', 'mobile_money_rate']

### Driver-Based Scenario Analysis

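Due to temporal data availability constraints, the driver analysis focuses on
two enriched indicators: 4G population coverage and mobile money account
penetration. These series are not observed in the same years as account
ownership, so each series was forward-filled to align them, leaving three
usable years (2023–2025).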

Results indicate that mobile money access is positively associated with
account ownership in this fit, while the coefficient on 4G coverage is
effectively zero. A digital acceleration scenario shows higher predicted
inclusion compared to the baseline trend, which suggests that targeted
infrastructure and adoption policies could help close the financial inclusion
gap. Given the three-row, forward-filled sample, these findings should be
treated as illustrative rather than as statistical evidence.