# Anostep Weekly survey as completed by each CHP (updated daily)

Here are some quick analyses of the data collected so far :)
We start by loading the cleaned data and preparing it for plotting.

In [8]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import json
import os
import webbrowser
from datetime import datetime, timedelta

def load_cleaned_data(filename="commcare_cleaned_data.csv"):
    """Read the cleaned survey export into a DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the file does not
        exist or cannot be read (a message is printed in both cases).
    """
    try:
        if os.path.exists(filename):
            return pd.read_csv(filename)

        # Nothing to read: tell the user how to produce the file.
        print(f"File '{filename}' not found!")
        print("Please run the data loader script first.")
    except Exception as e:
        print(f"Error loading data: {e}")

    return None

def find_columns(df, column_type):
    """Find columns whose name contains the keyword for ``column_type``.

    Matching is a case-insensitive substring test on the column name.

    Args:
        df: DataFrame whose columns are searched.
        column_type: One of ``'username'``, ``'anoph'``, ``'county'`` or
            ``'date'``. Any other value yields an empty list.

    Returns:
        List of matching column labels in their original order. For
        ``'date'``, columns containing ``'collection_date'`` are listed
        first so that callers taking the first match get the collection
        date rather than an unrelated date-like column.
    """
    if column_type not in ('username', 'anoph', 'county', 'date'):
        return []

    # str() keeps non-string column labels (e.g. integers) from raising.
    matches = [col for col in df.columns if column_type in str(col).lower()]

    if column_type == 'date':
        # Stable sort: False sorts before True, so collection_date columns
        # move to the front while relative order is otherwise preserved.
        matches.sort(key=lambda col: 'collection_date' not in str(col).lower())

    return matches

def prepare_time_data(df):
    """Parse the collection date and add week-level helper columns.

    The first column returned by ``find_columns(df, 'date')`` is parsed with
    ``pd.to_datetime(errors='coerce')``; rows whose date cannot be parsed are
    dropped. Four columns are then added to a copy of ``df``:

    * ``week_number`` - ISO week number of the date
    * ``year`` - calendar year of the date
    * ``year_week`` - label ``'<ISO year>-W<ISO week, 2 digits>'``
    * ``week_start`` - start timestamp of the pandas ``'W'`` period
      containing the date

    Args:
        df: Survey DataFrame. It is not modified.

    Returns:
        ``(df_time, date_col)`` on success. When no date column exists or no
        date can be parsed, ``(df, None)`` is returned and a message printed.
    """

    # Find date column
    date_cols = find_columns(df, 'date')
    if not date_cols:
        print("No collection date column found")
        return df, None

    date_col = date_cols[0]
    print(f"Using date column: {date_col}")

    # Convert to datetime; unparseable values become NaT
    df_time = df.copy()
    df_time[date_col] = pd.to_datetime(df_time[date_col], errors='coerce')

    # Remove rows with invalid dates
    df_time = df_time.dropna(subset=[date_col])

    if df_time.empty:
        print("No valid dates found")
        return df, None

    # Add week information.
    # The week label is built from the ISO calendar so that it agrees with
    # `week_number` (ISO) and `week_start` (Monday-based 'W' periods). The
    # previous '%Y-W%U' label used Sunday-based weeks and the calendar year,
    # which put Sunday records in a different bucket than `week_start` and
    # mislabelled days around New Year.
    iso = df_time[date_col].dt.isocalendar()
    df_time['week_number'] = iso['week']
    df_time['year'] = df_time[date_col].dt.year
    df_time['year_week'] = (
        iso['year'].astype(str) + '-W' + iso['week'].astype(str).str.zfill(2)
    )
    df_time['week_start'] = df_time[date_col].dt.to_period('W').dt.start_time

    print(f"Date range: {df_time[date_col].min()} to {df_time[date_col].max()}")
    print(f"Total weeks: {df_time['year_week'].nunique()}")

    return df_time, date_col

def _anoph_color(value):
    """Return the bar colour used for one anoph_present answer."""
    if pd.isna(value):
        return '#8B4513'  # Saddle Brown: missing answer
    label = str(value)
    if label.lower() == 'yes':
        return '#2E8B57'  # Sea Green
    if label.lower() == 'no':
        return '#DC143C'  # Crimson
    if label in ('___', '---') or label.strip() == '':
        return '#8B4513'  # Saddle Brown: placeholder / blank answer
    return '#708090'  # Slate Gray: any other answer


def _county_lookup(data, username_col, county_col, usernames):
    """Map each username to its most frequent non-null county in ``data``.

    Returns an empty dict when there is no county column; usernames without
    any county value map to ``"Unknown"``.
    """
    if not county_col:
        return {}

    known = data.dropna(subset=[county_col])
    if known.empty:
        return {username: "Unknown" for username in usernames}

    # One grouped pass instead of re-filtering the frame for every username.
    top_county = (
        known.groupby(username_col, sort=False)[county_col]
        .agg(lambda counties: counties.mode().iloc[0])
        .to_dict()
    )
    return {username: top_county.get(username, "Unknown") for username in usernames}


def _anoph_week_traces(week_data, week, username_col, anoph_col, county_col, categories):
    """Build one stacked-bar trace per anoph category for a single week.

    Every week yields exactly ``len(categories)`` traces in the same order,
    because Plotly matches frame traces to figure traces by position.
    """
    counts = pd.crosstab(week_data[username_col], week_data[anoph_col], dropna=False)
    counts = counts.reindex(
        index=sorted(counts.index, key=str),
        columns=categories,
        fill_value=0,
    )
    site_totals = counts.sum(axis=1)
    county_by_site = _county_lookup(week_data, username_col, county_col, counts.index)

    traces = []
    for position, anoph_value in enumerate(categories):
        # Positional access avoids label lookups on a possible NaN column.
        values = counts.iloc[:, position].values

        hover_text = []
        for site, count in zip(counts.index, values):
            if count > 0:
                total_for_site = site_totals.loc[site]
                percentage = (count / total_for_site) * 100 if total_for_site > 0 else 0
                county_name = county_by_site.get(site, "Unknown") if county_by_site else "N/A"
                hover_text.append(
                    f"<b>{site}</b><br>" +
                    f"Week: {week}<br>" +
                    f"County: {county_name}<br>" +
                    f"Anoph Present: {anoph_value}<br>" +
                    f"Count: {count}<br>" +
                    f"Percentage: {percentage:.1f}%<br>" +
                    f"Total for site: {total_for_site}"
                )
            else:
                hover_text.append("")

        traces.append(go.Bar(
            name=f'Anoph: {anoph_value}',
            x=counts.index,
            y=values,
            marker_color=_anoph_color(anoph_value),
            marker_line=dict(width=0.5, color='white'),
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=hover_text,
            opacity=0.8
        ))
    return traces


def _total_week_trace(week_data, week, username_col, county_col):
    """Build the single survey-count bar trace used when no anoph column exists."""
    username_counts = week_data[username_col].value_counts()
    username_counts = username_counts.reindex(sorted(username_counts.index, key=str))
    county_by_site = _county_lookup(week_data, username_col, county_col, username_counts.index)

    hover_text = []
    for site, count in zip(username_counts.index, username_counts.values):
        county_name = county_by_site.get(site, "Unknown") if county_by_site else "N/A"
        hover_text.append(
            f"<b>{site}</b><br>" +
            f"Week: {week}<br>" +
            f"County: {county_name}<br>" +
            f"Total Surveys: {count}"
        )

    return go.Bar(
        x=username_counts.index,
        y=username_counts.values,
        marker_color='steelblue',
        marker_line=dict(width=0.5, color='navy'),
        hovertemplate='%{hovertext}<extra></extra>',
        hovertext=hover_text,
        opacity=0.8
    )


def create_interactive_time_slider_graph(df):
    """Create interactive graph with time slider for collection weeks.

    One animation frame is built per ``year_week`` value. Each frame shows the
    number of surveys per collection site (username), stacked by the anoph
    column when one exists. Rows of the test user ``'an_steph_test'`` and rows
    with a missing or empty username are excluded.

    Args:
        df: Cleaned survey DataFrame. It needs a username column and a date
            column (as located by ``find_columns``); anoph and county columns
            are optional.

    Returns:
        ``(fig, df_filtered, unique_counties, unique_weeks)`` on success, or
        ``None`` (after printing the reason) when a required column is
        missing or no rows remain.
    """

    # Find required columns
    username_cols = find_columns(df, 'username')
    anoph_cols = find_columns(df, 'anoph')
    county_cols = find_columns(df, 'county')

    if not username_cols:
        print("No username column found")
        return None

    username_col = username_cols[0]
    anoph_col = anoph_cols[0] if anoph_cols else None
    county_col = county_cols[0] if county_cols else None

    # Prepare time data
    df_time, date_col = prepare_time_data(df)
    if date_col is None:
        print("Cannot create time slider without date information")
        return None

    # Filter out test user
    df_filtered = df_time[df_time[username_col] != 'an_steph_test'].copy()
    df_filtered = df_filtered.dropna(subset=[username_col])
    df_filtered = df_filtered[df_filtered[username_col] != '']

    if df_filtered.empty:
        print("No data found after filtering")
        return None

    # Get unique weeks sorted
    unique_weeks = sorted(df_filtered['year_week'].unique())
    unique_counties = sorted(df_filtered[county_col].dropna().unique()) if county_col else []

    print(f"Creating time slider for {len(unique_weeks)} weeks")

    # Fix the set and order of anoph categories once for the whole timeline.
    # Frames update the figure's traces by position, so every frame must
    # carry the same traces in the same order; otherwise a category that is
    # absent in the first week is never drawn (or drawn under another
    # category's name and colour) in later weeks.
    anoph_categories = []
    if anoph_col:
        answers = df_filtered[anoph_col]
        anoph_categories = sorted(answers.dropna().unique(), key=str)
        if answers.isna().any():
            anoph_categories.append(np.nan)

    # Create one frame per week
    frames = []
    for week in unique_weeks:
        week_data = df_filtered[df_filtered['year_week'] == week]

        if week_data.empty:
            continue

        if anoph_col:
            frame_traces = _anoph_week_traces(
                week_data, week, username_col, anoph_col, county_col, anoph_categories
            )
        else:
            frame_traces = [_total_week_trace(week_data, week, username_col, county_col)]

        frames.append(go.Frame(
            data=frame_traces,
            name=week,
            layout=go.Layout(
                title=f'Survey Count by Collection Site - Week {week}<br><sub>Total: {len(week_data):,} surveys</sub>'
            )
        ))

    # Create initial figure (first week)
    if frames:
        fig = go.Figure(data=frames[0].data, frames=frames)
    else:
        print("No frames created - no data available")
        return None

    # Update layout with slider
    total_surveys = len(df_filtered)
    unique_sites = df_filtered[username_col].nunique()
    unique_counties_count = len(unique_counties)

    fig.update_layout(
        title={
            'text': f'Survey Count by Collection Site - Interactive Timeline<br><sub>Total: {total_surveys:,} surveys from {unique_sites} sites across {unique_counties_count} counties</sub>',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 16}
        },
        xaxis_title='Collection Site',
        yaxis_title='Count of Survey',
        barmode='stack' if anoph_col else 'group',
        hovermode='closest',
        width=1200,
        height=700,
        font=dict(size=12),
        showlegend=True if anoph_col else False,
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02
        ),
        margin=dict(l=80, r=120, t=120, b=150),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        updatemenus=[
            dict(
                type="buttons",
                direction="left",
                buttons=[
                    dict(
                        # The "animate" method takes [frames, options]:
                        # None plays every frame, [None] interrupts the
                        # running animation. Without this first element the
                        # options dict is taken as the frame argument.
                        args=[None,
                              {"frame": {"duration": 500, "redraw": True},
                               "fromcurrent": True, "transition": {"duration": 300}}],
                        label="Play",
                        method="animate"
                    ),
                    dict(
                        args=[[None],
                              {"frame": {"duration": 0, "redraw": True},
                               "mode": "immediate",
                               "transition": {"duration": 0}}],
                        label="Pause",
                        method="animate"
                    )
                ],
                pad={"r": 10, "t": 87},
                showactive=False,
                x=0.011,
                xanchor="right",
                y=0,
                yanchor="top"
            ),
        ],
        sliders=[
            dict(
                active=0,
                yanchor="top",
                xanchor="left",
                currentvalue={
                    "font": {"size": 16},
                    "prefix": "Week: ",
                    "visible": True,
                    "xanchor": "right"
                },
                transition={"duration": 300, "easing": "cubic-in-out"},
                pad={"b": 10, "t": 50},
                len=0.9,
                x=0.1,
                y=0,
                # One step per frame that was actually built
                steps=[
                    dict(
                        args=[
                            [frame.name],
                            {"frame": {"duration": 300, "redraw": True},
                             "mode": "immediate",
                             "transition": {"duration": 300}}
                        ],
                        label=frame.name,
                        method="animate"
                    ) for frame in frames
                ]
            )
        ]
    )

    # Update axes
    fig.update_xaxes(
        tickangle=45,
        tickfont=dict(size=10),
        gridcolor='lightgray',
        gridwidth=0.5,
        showgrid=True
    )

    fig.update_yaxes(
        gridcolor='lightgray',
        gridwidth=0.5,
        showgrid=True,
        zeroline=True,
        zerolinecolor='gray',
        zerolinewidth=1
    )

    return fig, df_filtered, unique_counties, unique_weeks

def create_simple_username_graph(df):
    """Build a static bar chart of survey counts per collection site.

    Bars are stacked and coloured by the anoph column when one is found;
    otherwise a plain count per site is drawn. Rows of the test user
    'an_steph_test' and rows with a missing or empty username are left out.

    Returns:
        ``(fig, df_filtered, unique_counties)``, or ``None`` (after printing
        the reason) when no username column exists or no rows remain.
    """

    # Resolve the columns the chart is built from
    found = {kind: find_columns(df, kind) for kind in ('username', 'anoph', 'county')}

    if not found['username']:
        print("No username column found")
        return None

    username_col = found['username'][0]
    anoph_col = found['anoph'][0] if found['anoph'] else None
    county_col = found['county'][0] if found['county'] else None

    print(f"Using columns - Username: {username_col}")
    if anoph_col:
        print(f"Anoph: {anoph_col}")
    if county_col:
        print(f"County: {county_col}")

    # Drop the test account, then rows without a usable username
    df_filtered = df[df[username_col] != 'an_steph_test'].copy()
    df_filtered = df_filtered.dropna(subset=[username_col])
    df_filtered = df_filtered[df_filtered[username_col] != '']

    if df_filtered.empty:
        print("No data found after filtering")
        return None

    # Surveys per site, ordered alphabetically by site name
    df_filtered[username_col] = df_filtered[username_col].astype(str)
    site_totals = df_filtered[username_col].value_counts()
    site_totals = site_totals.reindex(sorted(site_totals.index, key=str))

    # Most common county recorded for each site
    county_mapping = {}
    if county_col:
        for site_name in site_totals.index:
            recorded = df_filtered[df_filtered[username_col] == site_name][county_col].dropna()
            if recorded.empty:
                county_mapping[site_name] = "Unknown"
                continue
            modes = recorded.mode()
            county_mapping[site_name] = modes.iloc[0] if len(modes) > 0 else recorded.iloc[0]

    # Counties listed in the title / returned to the caller
    unique_counties = sorted(df_filtered[county_col].dropna().unique()) if county_col else []

    def county_label(site_name):
        # "N/A" only when no mapping was built at all
        if not county_mapping:
            return "N/A"
        return county_mapping.get(site_name, "Unknown")

    def bar_colour(answer):
        text = str(answer)
        if text.lower() == 'yes':
            return '#2E8B57'  # Sea Green
        if text.lower() == 'no':
            return '#DC143C'  # Crimson
        if text == '___' or text == '---' or text.strip() == '' or pd.isna(answer):
            return '#8B4513'  # Saddle Brown
        return '#708090'  # Slate Gray

    fig = go.Figure()

    if anoph_col and anoph_col in df_filtered.columns:
        # Site x answer count table drives the stacked bars
        table = pd.crosstab(df_filtered[username_col], df_filtered[anoph_col], dropna=False)
        table = table.reindex(sorted(table.index, key=str), fill_value=0)

        colours = {answer: bar_colour(answer) for answer in table.columns}

        for anoph_value in table.columns:
            heights = table[anoph_value].values

            tooltips = []
            for site_name, n in zip(table.index, heights):
                if not n > 0:
                    tooltips.append("")
                    continue
                total_for_site = table.loc[site_name].sum()
                share = (n / total_for_site) * 100 if total_for_site > 0 else 0
                tooltips.append(
                    f"<b>{site_name}</b><br>"
                    f"County: {county_label(site_name)}<br>"
                    f"Anoph Present: {anoph_value}<br>"
                    f"Count: {n}<br>"
                    f"Percentage: {share:.1f}%<br>"
                    f"Total for site: {total_for_site}"
                )

            fig.add_trace(go.Bar(
                name=f'Anoph: {anoph_value}',
                x=table.index,
                y=heights,
                marker_color=colours[anoph_value],
                marker_line=dict(width=0.5, color='white'),
                hovertemplate='%{hovertext}<extra></extra>',
                hovertext=tooltips,
                opacity=0.8
            ))
    else:
        # No anoph information: one plain bar per site
        tooltips = [
            f"<b>{site_name}</b><br>"
            f"County: {county_label(site_name)}<br>"
            f"Total Surveys: {n}"
            for site_name, n in zip(site_totals.index, site_totals.values)
        ]

        fig.add_trace(go.Bar(
            x=site_totals.index,
            y=site_totals.values,
            marker_color='steelblue',
            marker_line=dict(width=0.5, color='navy'),
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=tooltips,
            opacity=0.8
        ))

    # Title with overall totals
    total_surveys = len(df_filtered)
    unique_sites = len(site_totals)
    unique_counties_count = len(unique_counties)

    county_suffix = f' across {unique_counties_count} counties</sub>' if unique_counties_count > 0 else '</sub>'
    title_text = (
        f'Survey Count by Collection Site<br><sub>Total: {total_surveys:,} surveys from {unique_sites} sites'
        + county_suffix
    )

    fig.update_layout(
        title=dict(text=title_text, x=0.5, xanchor='center', font=dict(size=16)),
        xaxis_title='Collection Site',
        yaxis_title='Count of Survey',
        barmode='stack' if anoph_col else 'group',
        hovermode='closest',
        width=1400,
        height=700,
        font={'size': 12},
        showlegend=bool(anoph_col),
        legend={
            'orientation': "v",
            'yanchor': "top",
            'y': 1,
            'xanchor': "left",
            'x': 1.02,
        },
        margin={'l': 80, 'r': 200, 't': 100, 'b': 150},
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)'
    )

    # Shared grid styling for both axes
    grid_style = dict(gridcolor='lightgray', gridwidth=0.5, showgrid=True)

    fig.update_xaxes(tickangle=45, tickfont=dict(size=10), **grid_style)
    fig.update_yaxes(zeroline=True, zerolinecolor='gray', zerolinewidth=1, **grid_style)

    return fig, df_filtered, unique_counties

def main():
    """Build the weekly time-slider chart, save it as HTML and open it.

    Loads the cleaned CSV, creates the figure with
    ``create_interactive_time_slider_graph``, writes it to
    ``weekly_interactive_survey_count.html`` and tries to open that file in
    the default browser. Failures are printed rather than raised.
    """
    try:
        # Load the cleaned data
        df = load_cleaned_data()
        if df is None:
            return

        print("Creating interactive survey visualization with time slider...")

        # Create interactive graph with time slider
        result = create_interactive_time_slider_graph(df)

        if result is None:
            print("Failed to create interactive graph")
            return

        if len(result) != 4:
            print("Unexpected result format")
            return

        fig, _df_filtered, unique_counties, unique_weeks = result

        # Save as HTML
        main_filename = 'weekly_interactive_survey_count.html'
        fig.write_html(
            main_filename,
            config={
                'displayModeBar': True,
                'displaylogo': False,
                'modeBarButtonsToAdd': ['drawline', 'drawopenpath', 'drawclosedpath', 'drawcircle', 'drawrect', 'eraseshape']
            }
        )

        print(f"Interactive time-slider graph saved as: {main_filename}")
        print(f"Time range: {len(unique_weeks)} weeks of data")
        if unique_counties:
            # str() so non-string county values cannot break the join
            print(f"Counties included: {', '.join(map(str, unique_counties))}")

        # Open in browser
        try:
            from pathlib import Path

            print("Opening interactive visualization in browser...")
            # as_uri() yields a valid, percent-encoded file URL on every
            # platform; 'file://' + path is malformed for Windows paths and
            # leaves spaces and other special characters unescaped.
            webbrowser.open(Path(main_filename).resolve().as_uri())

        except Exception as e:
            print(f"Could not open browser automatically: {e}")
            print(f"Please manually open: {main_filename}")

        print("\nInteractive features:")
        print("- Time slider to navigate through collection weeks")
        print("- Play/Pause buttons for animated timeline")
        print("- Hover over bars for detailed information (including county and week)")
        print("- Click legend items to show/hide categories")
        print("- Use toolbar for zoom, pan, and selection")
        print("- Double-click to reset zoom")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Creating interactive survey visualization with time slider...
Using date column: collection_date
Date range: 2025-07-30 00:00:00 to 2025-09-18 00:00:00
Total weeks: 8
Creating time slider for 8 weeks
Interactive time-slider graph saved as: weekly_interactive_survey_count.html
Time range: 8 weeks of data
Counties included: Baringo, Bungoma, Garissa, Kakamega, Kisumu, Kitui, Lamu, Machakos, Mandera, Marsabit, Meru, Migori, Nakuru, Narok, Samburu, Tana River, Turkana, Wajir, West Pokot
Opening interactive visualization in browser...

Interactive features:
- Time slider to navigate through collection weeks
- Play/Pause buttons for animated timeline
- Hover over bars for detailed information (including county and week)
- Click legend items to show/hide categories
- Use toolbar for zoom, pan, and selection
- Double-click to reset zoom


### Great work team!👏 👏 👏

In [13]:
import pandas as pd
import numpy as np

def load_and_analyze_distance_data(filename="commcare_cleaned_data.csv", threshold=2000):
    """Report the records whose ``distance_from_site`` exceeds ``threshold``.

    The ``distance_from_site`` column is converted to numbers (unparseable
    values are reported and treated as 0, as are missing values), truncated
    to integers, and the rows above ``threshold`` are printed together with
    summary statistics.

    Args:
        filename: CSV file to read.
        threshold: Distance above which a record is reported. The default
            keeps the original cut-off of 2000.

    Returns:
        DataFrame with the matching rows, restricted to the integer distance
        plus the identifying columns that exist in the file. When no row
        exceeds the threshold an empty DataFrame is returned. ``None`` is
        returned when the file is missing, the distance column is absent or
        processing fails.
    """

    try:
        # Load the data
        df = pd.read_csv(filename)

        # Fail with a clear message instead of a KeyError traceback
        if 'distance_from_site' not in df.columns:
            print("Error: 'distance_from_site' column not found in the dataset")
            print("Available columns:", list(df.columns))
            return None

        # First convert to numeric, coercing errors to NaN
        df['distance_from_site_numeric'] = pd.to_numeric(df['distance_from_site'], errors='coerce')

        # Check for any values that couldn't be converted
        conversion_issues = df[df['distance_from_site_numeric'].isnull() & df['distance_from_site'].notna()]
        if not conversion_issues.empty:
            print(f"Warning: {len(conversion_issues)} values could not be converted to numeric:")
            print(conversion_issues['distance_from_site'].unique())

        # Convert to integer; missing/unparseable distances count as 0
        df['distance_from_site_int'] = df['distance_from_site_numeric'].fillna(0).astype(int)

        # Filter for values greater than the threshold
        high_distance_df = df[df['distance_from_site_int'] > threshold].copy()

        if high_distance_df.empty:
            print(f"No records found with distance_from_site > {threshold}")
            # Return the (empty) filtered frame, not the full dataset:
            # callers use the row count as the number of high-distance
            # records and save the result as such.
            return high_distance_df

        # Check if required columns exist
        required_columns = ['username', 'county', 'anoph_present', 'interviewer']
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            print(f"Warning: Missing columns: {missing_columns}")
            print("Available columns that might be relevant:")

            # Look for similar column names
            for missing_col in missing_columns:
                similar_cols = [col for col in df.columns if missing_col.lower() in col.lower() or col.lower() in missing_col.lower()]
                if similar_cols:
                    print(f"  For '{missing_col}': {similar_cols}")

            # Use available columns. Keywords are lower-case because they
            # are compared against lower-cased column names.
            keywords = ('username', 'user', 'county', 'anoph', 'chp')
            available_columns = ['distance_from_site_int'] + [col for col in required_columns if col in df.columns]
            additional_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in keywords)]
            available_columns.extend([col for col in additional_cols if col not in available_columns])
        else:
            available_columns = ['distance_from_site_int'] + required_columns

        # Select and display the filtered data
        result_df = high_distance_df[available_columns].copy()

        print(f"\nDisplaying {len(result_df)} records with distance > {threshold}:")
        print("=" * 80)

        # Display all rows
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', 50)

        print(result_df.to_string(index=False))

        # Summary statistics
        print("\n" + "=" * 80)
        print("SUMMARY STATISTICS:")
        print(f"Total records with distance > {threshold}: {len(result_df)}")

        if 'username' in result_df.columns:
            print(f"Unique users: {result_df['username'].nunique()}")
            user_counts = result_df['username'].value_counts()
            # value_counts() is empty when every username is missing
            if not user_counts.empty:
                print(f"Most frequent user: {user_counts.index[0]} ({user_counts.iloc[0]} records)")

        if 'county' in result_df.columns:
            print(f"Unique counties: {result_df['county'].nunique()}")
            county_counts = result_df['county'].value_counts()
            print(f"Counties represented: {', '.join(map(str, county_counts.index.tolist()))}")

        if 'anoph_present' in result_df.columns:
            anoph_counts = result_df['anoph_present'].value_counts()
            print("Anopheles presence distribution:")
            for value, count in anoph_counts.items():
                print(f"  {value}: {count} records ({count/len(result_df)*100:.1f}%)")

        print("Distance statistics for filtered data:")
        print(f"  Min: {result_df['distance_from_site_int'].min()}")
        print(f"  Max: {result_df['distance_from_site_int'].max()}")
        print(f"  Mean: {result_df['distance_from_site_int'].mean():.1f}")
        print(f"  Median: {result_df['distance_from_site_int'].median():.1f}")

        return result_df

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found!")
        return None
    except Exception as e:
        print(f"Error processing data: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """Run the high-distance report and save the matching rows to CSV."""
    print("Distance Analysis: Records with distance_from_site > 2000")
    print("=" * 60)

    # Run the analysis; None signals a failure, an empty frame no matches
    result = load_and_analyze_distance_data()

    if result is None or not len(result) > 0:
        print("\nNo records found or analysis failed.")
        return

    print(f"\nAnalysis complete! Found {len(result)} records with distance > 2000.")

    # Keep a copy of the reported rows on disk
    output_filename = "high_distance_records.csv"
    result.to_csv(output_filename, index=False)
    print(f"Results saved to: {output_filename}")

if __name__ == "__main__":
    main()

Distance Analysis: Records with distance_from_site > 2000

Filtering for distance_from_site > 2000:
Found 444 rows with distance > 2000

Displaying 444 records with distance > 2000:
 distance_from_site_int username     county anoph_present             interviewer
                   3488     gp26    Turkana           yes     Ebei Loroti Carlmax
                   4955     gp26    Turkana            no     Ebei Loroti Carlmax
                   3588     gp22    Turkana           yes     Ekeno Lokiridi Alex
                   4543     gp22    Turkana           yes     Ekeno Lokiridi Alex
                   2035     gp26    Turkana           yes     Ebei Loroti Carlmax
                   6735     gp47    Turkana            no   Stephen Ekuwom Ewoton
                   6742     gp47    Turkana            no   Stephen Ekuwom Ewoton
                   3546     gp22    Turkana           yes     Ekeno Lokiridi Alex
                   3706     gp22    Turkana           yes     Ekeno Lokiridi Ale