In [21]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from scipy.stats import zscore

# Load your dataset
df = pd.read_csv('merged_file.csv')

# Ensure there are no missing values in latitude and longitude columns
df_clean = df.dropna(subset=['latitude', 'longitude'])

# Function to find neighbors within a radius
def find_neighbors(df_clean, radius=1):
    neighbors = {}
    for i, row in df_clean.iterrows():
        unit_neighbors = []
        for j, other_row in df_clean.iterrows():
            if i != j:
                distance = geodesic((row['latitude'], row['longitude']), (other_row['latitude'], other_row['longitude'])).km
                if distance <= radius:
                    unit_neighbors.append(j)
        neighbors[i] = unit_neighbors
    return neighbors

# Identify neighbors
neighbors = find_neighbors(df_clean)

# Calculate outlier scores based on party votes
outlier_scores = []
parties = df_clean.columns[df_clean.columns.str.contains('APC|LP|PDP|NNPP')]

for unit, neighbor_indices in neighbors.items():
    for party in parties:
        unit_votes = df_clean.loc[unit, party]
        neighbor_votes = df_clean.loc[neighbor_indices, party]
        if len(neighbor_votes) > 0:  # Ensure there are neighbors to compare
            z_scores = zscore(neighbor_votes)
            for idx, z in enumerate(z_scores):
                outlier_scores.append({
                    'unit': df_clean.loc[unit, 'PU-name'],
                    'neighbor': df_clean.loc[neighbor_indices[idx], 'PU-name'],
                    'party': party,
                    'votes': unit_votes,
                    'neighbor_votes': neighbor_votes.iloc[idx],
                    'z_score': z
                })

# Create a DataFrame of outlier scores
outlier_df = pd.DataFrame(outlier_scores)

# Sort by outlier scores
sorted_outliers = outlier_df.sort_values(by='z_score', ascending=False)

# Save the sorted outlier scores to a CSV file
sorted_outliers.to_csv('/mnt/data/outlier_scores.csv', index=False)

# Identify top 3 outliers
top_3_outliers = sorted_outliers.head(3)

# Generate a report
with open('/mnt/data/outlier_report.txt', 'w') as report:
    report.write('Outlier Detection Report\n')
    report.write('========================\n')
    report.write('Methodology:\n')
    report.write('1. Neighboring polling units identified within a 1 km radius.\n')
    report.write('2. Z-scores calculated based on the deviation of votes from neighboring units.\n')
    report.write('3. Polling units with the highest Z-scores considered as outliers.\n\n')
    report.write('Top 3 Outliers:\n')
    report.write('===============\n')
    for _, row in top_3_outliers.iterrows():
        report.write(f"Polling Unit: {row['unit']}, Party: {row['party']}, Votes: {row['votes']}, Neighbor Votes: {row['neighbor_votes']}, Z-Score: {row['z_score']}\n")

# Display the top 3 outliers
print(top_3_outliers)



KeyError: 'PU-name'

In [25]:
df2=pd.read_csv('merged_file.csv')
df2.dropna(subset=['latitude', 'longitude'],inplace=True)
df2.to_csv('excel_work.csv',index=False)