## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances

## Loading the Dataset

In [None]:
# Polling-unit level results with coordinates, read from the mounted Drive folder.
# Each row carries State/LGA/Ward/PU-Name, Latitude/Longitude and per-party vote counts.
zamfara_csv = "/content/drive/MyDrive/outlier analysis/ZAMFARA & Coordinates.csv"
df = pd.read_csv(zamfara_csv)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,State,LGA,PU-Name,Ward,Latitude,Longitude,APC,LP,PDP,NNPP
0,ZAMFARA,ANKA,BAGEGA II/MAKARANTA,BAGEGA,11.86603,5.99806,86,0,36,3
1,ZAMFARA,BAKURA,SHIYAR TUDU/PRIMARY SCHOOL,BIRNIN TUDU,7.07464,6.28116,295,0,55,0
2,ZAMFARA,BAKURA,S/AJIYA II/VILLAGE HEAD OFFICE,BIRNIN TUDU,12.08621,5.29921,184,0,74,0
3,ZAMFARA,BAKURA,DAMRI I/MODEL PRIMARY SCHOOL,DAMRI,7.07464,6.28116,511,0,37,0
4,ZAMFARA,BAKURA,MADACCI II/ADULT EDU. CLASS,DANKADU,12.209366,4.563697,129,0,204,14


## Clustering

In [None]:
# Mean Earth radius in kilometres; scales unit-sphere haversine distances to km.
EARTH_RADIUS_KM = 6371

# haversine_distances expects [latitude, longitude] pairs in radians, so convert
# the two coordinate columns directly instead of staging helper columns on df.
coords = np.radians(df[['Latitude', 'Longitude']].to_numpy())

# Full pairwise great-circle distance matrix, in kilometres.
distance_matrix = haversine_distances(coords) * EARTH_RADIUS_KM

# DBSCAN on the precomputed matrix: eps is therefore in kilometres, and with
# min_samples=1 every point is a core point, so no row is labelled as noise (-1).
db = DBSCAN(eps=1.0, min_samples=1, metric='precomputed')
df['cluster'] = db.fit_predict(distance_matrix)

# Preview the first rows together with their cluster labels.
df.head()

Unnamed: 0,State,LGA,PU-Name,Ward,Latitude,Longitude,APC,LP,PDP,NNPP,cluster
0,ZAMFARA,ANKA,BAGEGA II/MAKARANTA,BAGEGA,11.86603,5.99806,86,0,36,3,0
1,ZAMFARA,BAKURA,SHIYAR TUDU/PRIMARY SCHOOL,BIRNIN TUDU,7.07464,6.28116,295,0,55,0,1
2,ZAMFARA,BAKURA,S/AJIYA II/VILLAGE HEAD OFFICE,BIRNIN TUDU,12.08621,5.29921,184,0,74,0,2
3,ZAMFARA,BAKURA,DAMRI I/MODEL PRIMARY SCHOOL,DAMRI,7.07464,6.28116,511,0,37,0,1
4,ZAMFARA,BAKURA,MADACCI II/ADULT EDU. CLASS,DANKADU,12.209366,4.563697,129,0,204,14,3


## Outlier Score Calculation

In [None]:
# Function to calculate outlier scores
def calculate_outlier_score(df):
    """Score each row by how far its party votes deviate from its cluster's mean.

    For every row, the score is the sum over the APC, LP, PDP and NNPP columns
    of the absolute difference between the row's value and the mean of that
    column across all rows sharing the same ``cluster`` label. The row itself
    is part of that mean, so a row that is alone in its cluster scores 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cluster``, ``APC``, ``LP``, ``PDP`` and
        ``NNPP``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with a float
        ``OutlierScore`` column added (or overwritten).

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    party_cols = ['APC', 'LP', 'PDP', 'NNPP']

    # Per-row cluster means computed in a single grouped pass, replacing the
    # original per-row filter of the whole frame (quadratic in the row count).
    cluster_means = df.groupby('cluster')[party_cols].transform('mean')

    # skipna=False keeps the original semantics: a missing vote count yields
    # a missing score instead of being silently treated as zero deviation.
    scores = (df[party_cols] - cluster_means).abs().sum(axis=1, skipna=False)

    # Assign positionally as a float column. The original seeded an integer
    # column and wrote floats into it by label, which newer pandas flags as an
    # incompatible-dtype assignment and which misbehaves on duplicate labels.
    df['OutlierScore'] = scores.to_numpy(dtype=float)
    return df

# Score every polling unit against the mean vote counts of its cluster
df = df.pipe(calculate_outlier_score)

# Preview the first five rows, now including the OutlierScore column
df.head(n=5)

Unnamed: 0,State,LGA,PU-Name,Ward,Latitude,Longitude,APC,LP,PDP,NNPP,cluster,OutlierScore
0,ZAMFARA,ANKA,BAGEGA II/MAKARANTA,BAGEGA,11.86603,5.99806,86,0,36,3,0,0.0
1,ZAMFARA,BAKURA,SHIYAR TUDU/PRIMARY SCHOOL,BIRNIN TUDU,7.07464,6.28116,295,0,55,0,1,124.111111
2,ZAMFARA,BAKURA,S/AJIYA II/VILLAGE HEAD OFFICE,BIRNIN TUDU,12.08621,5.29921,184,0,74,0,2,0.0
3,ZAMFARA,BAKURA,DAMRI I/MODEL PRIMARY SCHOOL,DAMRI,7.07464,6.28116,511,0,37,0,1,358.111111
4,ZAMFARA,BAKURA,MADACCI II/ADULT EDU. CLASS,DANKADU,12.209366,4.563697,129,0,204,14,3,0.0


In [None]:
# Persist the scored dataset next to the input file; index=False keeps the
# DataFrame's row index out of the written CSV.
processed_csv = '/content/drive/MyDrive/outlier analysis/processed_dataset.csv'
df.to_csv(processed_csv, index=False)
