In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [3]:
df=pd.read_excel("True_Palmpay_Account_Users.xlsx")
print(df.shape)
df.head(10)

(253, 20)


Unnamed: 0,state,lga_code,lga_name,ward_code,ward_name,area_name,population,palmpay_accounts,opay_accounts,moniepoint_accounts,device_used,source_of_income,bank_charges,palmpay_penetration,opay_penetration,moniepoint_penetration,multi_app_users,coverage_percent,Gender,age_bracket
0,Lagos,LGA001,Agege,AGE-01,Isale Oja,Isale Oja Area,1782743,377679,322297,315611,Desktop,Freelance,87.96,0.211853,0.180787,0.177037,648958,56.967662,Female,18-25
1,Lagos,LGA001,Agege,AGE-02,Oniwaya,Oniwaya Area,1329911,477653,240576,462356,Feature Phone,Student,94.47,0.359162,0.180896,0.347659,725000,88.77173,Female,36-45
2,Lagos,LGA001,Agege,AGE-03,Orile Agege,Orile Agege Area,881743,235616,368057,302350,iOS,Tips,48.88,0.267216,0.41742,0.3429,612526,102.753637,Male,18-25
3,Lagos,LGA001,Agege,AGE-04,Papa Ashafa,Papa Ashafa Area,2343882,736263,501085,889199,Desktop,Freelance,44.42,0.314121,0.213784,0.37937,1793612,90.727562,Female,26-35
4,Lagos,LGA001,Agege,AGE-05,Iloro,Iloro Area,2742991,1314096,684986,605535,Android,Salary,15.85,0.479074,0.249722,0.220757,1958375,94.955361,Female,18-25
5,Lagos,LGA001,Agege,AGE-06,Oko_Oba,Oko_Oba Area,3506664,1224360,594147,999197,Feature Phone,Student,54.57,0.349152,0.169434,0.284942,1860407,80.352837,Male,26-35
6,Lagos,LGA001,Agege,AGE-07,Dopemu,Dopemu Area,4888669,1193637,1539189,697127,Feature Phone,Tips,97.26,0.244164,0.314848,0.142601,2236279,70.161285,Female,26-35
7,Lagos,LGA001,Agege,AGE-08,Pen_Cinema,Pen_Cinema Area,2389435,555184,919768,462108,Desktop,Student,14.07,0.232349,0.384931,0.193396,1610984,81.0677,Female,26-35
8,Lagos,LGA001,Agege,AGE-09,Agbotikuyo,Agbotikuyo Area,3456612,541738,542791,812942,Feature Phone,Business,96.87,0.156725,0.15703,0.235185,1394495,54.893954,Female,46-60
9,Lagos,LGA001,Agege,AGE-10,Okekoto,Okekoto Area,2703740,642145,967893,647270,Android,Salary,27.88,0.237502,0.357983,0.239398,1363208,83.488353,Female,36-45


In [4]:
DROP_COLS = [
    "state", "lga_code", "ward_code", "ward_name", "area_name",
     "opay_accounts", "moniepoint_accounts",
    "palmpay_penetration", "opay_penetration", "moniepoint_penetration"
]

In [5]:
df = df.drop(columns=DROP_COLS)

In [6]:
print(df.shape)
df.head()

(253, 10)


Unnamed: 0,lga_name,population,palmpay_accounts,device_used,source_of_income,bank_charges,multi_app_users,coverage_percent,Gender,age_bracket
0,Agege,1782743,377679,Desktop,Freelance,87.96,648958,56.967662,Female,18-25
1,Agege,1329911,477653,Feature Phone,Student,94.47,725000,88.77173,Female,36-45
2,Agege,881743,235616,iOS,Tips,48.88,612526,102.753637,Male,18-25
3,Agege,2343882,736263,Desktop,Freelance,44.42,1793612,90.727562,Female,26-35
4,Agege,2742991,1314096,Android,Salary,15.85,1958375,94.955361,Female,18-25


In [7]:
TARGET = "palmpay_user"

CATEGORICAL_COLS = [
    "Gender",
    "device_used",
    "source_of_income",
    "age_bracket",
    "lga_name"
]

NUMERICAL_COLS = [
    "population",
    "bank_charges",
    "multi_app_users",
    "coverage_percent"
]

In [8]:
# Create target based on median
threshold = df['palmpay_accounts'].median()
df[TARGET] = (df['palmpay_accounts'] > threshold).astype(int)

# Features and target
X = df[CATEGORICAL_COLS + NUMERICAL_COLS]
y = df[TARGET]

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
# Numerical transformer: StandardScaler
num_transformer = StandardScaler()

# Categorical transformer: OneHotEncoder (drop='first' avoids dummy variable trap)
cat_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

# Combine into ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, NUMERICAL_COLS),
        ("cat", cat_transformer, CATEGORICAL_COLS)
    ]
)

In [12]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

In [13]:
pipeline.fit(X_train, y_train)

In [14]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9607843137254902

Confusion Matrix:
 [[26  0]
 [ 2 23]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      0.92      0.96        25

    accuracy                           0.96        51
   macro avg       0.96      0.96      0.96        51
weighted avg       0.96      0.96      0.96        51



In [15]:
y_pred=pipeline.predict(X_test)
print(y_pred)

[1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 1 1 0
 1 1 1 0 0 1 0 1 0 1 0 1 0 0]


In [16]:
results_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})

results_df.head()

Unnamed: 0,Actual,Predicted
208,1,1
144,1,1
90,0,0
118,0,0
1,0,0


In [17]:
joblib.dump(pipeline, "Palmpay_Pipeline1.pkl")
print("✅ Model pipeline saved correctly")

✅ Model pipeline saved correctly


In [18]:
print(type(pipeline))
pipeline

<class 'sklearn.pipeline.Pipeline'>
