In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import folium
from IPython.display import display

# Function to load and prepare dataset
def load_and_prepare_data(filepath):
    """Read the spreadsheet at ``filepath`` and mean-impute missing coordinates.

    The per-column count of missing values is printed before and after the
    imputation step. Only the ``latitude`` and ``longitude`` columns are
    imputed; every other column is returned exactly as loaded.

    Args:
        filepath: Location of the Excel file, handed to ``pandas.read_excel``.

    Returns:
        The loaded DataFrame with gaps in ``latitude``/``longitude`` filled.
    """
    coord_columns = ['latitude', 'longitude']
    frame = pd.read_excel(filepath)

    print("Before handling missing values:")
    print(frame.isnull().sum())

    # Replace each missing coordinate with the mean of its own column.
    mean_imputer = SimpleImputer(strategy='mean')
    filled_coords = mean_imputer.fit_transform(frame[coord_columns])
    frame[coord_columns] = filled_coords

    print("After handling missing values:")
    print(frame.isnull().sum())

    return frame

# Function to perform clustering
def perform_clustering(data, num_clusters=3, random_state=42):
    """Assign every row to a K-Means cluster based on its coordinates.

    Args:
        data: DataFrame with numeric ``latitude`` and ``longitude`` columns.
            It is modified in place: a ``cluster`` column holding each row's
            cluster label is added (or overwritten).
        num_clusters: Number of clusters to fit.
        random_state: Seed for the centroid initialisation. The fixed default
            keeps labels (and therefore any colour assigned per label)
            reproducible between runs; pass ``None`` to get a fresh random
            initialisation on every call.

    Returns:
        A ``(data, centers)`` tuple: the same DataFrame object that was passed
        in, and the fitted cluster centres with one
        ``[latitude, longitude]`` row per cluster.
    """
    features = data[['latitude', 'longitude']]
    # n_init is spelled out on purpose: leaving it implicit emits a
    # FutureWarning on scikit-learn 1.2/1.3 and silently changes the number of
    # restarts on 1.4+, so results would depend on the installed version.
    kmeans_model = KMeans(
        n_clusters=num_clusters,
        n_init=10,
        random_state=random_state,
    )
    data['cluster'] = kmeans_model.fit_predict(features)
    return data, kmeans_model.cluster_centers_

# Function to calculate cluster counts
def get_cluster_counts(data):
    """Count how many rows fall into each cluster.

    Args:
        data: DataFrame containing a ``cluster`` column.

    Returns:
        A dict mapping each cluster label to the number of rows carrying it.
    """
    labels = data['cluster']
    counts_per_label = labels.value_counts()
    return counts_per_label.to_dict()

# Function to create a map with clusters
def create_cluster_map(data, cluster_counts, map_center, colors):
    """Build a folium map with one coloured circle marker per row of ``data``.

    Args:
        data: DataFrame with ``latitude``, ``longitude`` and ``cluster``
            columns.
        cluster_counts: Mapping from cluster label to the number of rows in
            that cluster; the count is shown in each marker's popup.
        map_center: ``[latitude, longitude]`` the map is initially centred on.
        colors: Colours indexed by cluster label.

    Returns:
        The populated ``folium.Map``.
    """
    map_clusters = folium.Map(location=map_center, zoom_start=5)

    # Walk the three needed columns directly instead of using iterrows():
    # iterrows() allocates a Series per row and does not preserve dtypes, so
    # on an all-numeric frame the cluster label arrives as a float and cannot
    # be used to index ``colors``.
    points = zip(data['latitude'], data['longitude'], data['cluster'])
    for latitude, longitude, raw_label in points:
        label = int(raw_label)  # tolerate a float-typed cluster column
        marker_color = colors[label]
        folium.CircleMarker(
            location=[latitude, longitude],
            radius=5,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7,
            popup=f"Cluster: {label}<br>Incidents: {cluster_counts[label]}",
        ).add_to(map_clusters)
    return map_clusters

# Main function to run all steps
def main(filepath=None, num_clusters=3, colors=None,
         output_path="cluster_map2.html"):
    """Run the full pipeline: load, cluster, map, save and display.

    Called with no arguments it uses the built-in dataset path, three clusters
    and the red/blue/green palette.

    Args:
        filepath: Excel file to load. ``None`` selects the built-in default
            path.
        num_clusters: Number of K-Means clusters to fit.
        colors: One colour per cluster label. ``None`` selects
            ``['red', 'blue', 'green']``.
        output_path: Where the HTML rendering of the map is written.

    Returns:
        The ``folium.Map`` that was saved and displayed, so the caller can
        keep working with it.

    Raises:
        ValueError: If fewer colours than clusters are supplied.
    """
    if filepath is None:
        filepath = '/Users/charishyadavali/Downloads/fatal-police-shootings-data-continental-us.xls'
    if colors is None:
        colors = ['red', 'blue', 'green']
    # Fail early with a clear message instead of an IndexError while the
    # markers are being drawn.
    if len(colors) < num_clusters:
        raise ValueError(
            f"need at least {num_clusters} colors, got {len(colors)}"
        )

    # Load data and handle missing values
    dataset = load_and_prepare_data(filepath)

    # Perform clustering (the centres are returned but not needed here)
    dataset, _cluster_centers = perform_clustering(dataset, num_clusters)

    # Get cluster incident counts
    cluster_counts = get_cluster_counts(dataset)

    # Create map centered on mean latitude and longitude
    map_center = [dataset['latitude'].mean(), dataset['longitude'].mean()]
    cluster_map = create_cluster_map(dataset, cluster_counts, map_center, colors)

    # Save and display the map
    cluster_map.save(output_path)
    print(f"Map saved as {output_path}")

    # Display the map inline in Jupyter Notebook or compatible environment
    display(cluster_map)

    # Hand the map back: the caller stores main()'s result, which would
    # otherwise always be None.
    return cluster_map

# Entry point: run the whole pipeline only when this module is executed
# directly (not when it is imported), and keep whatever main() returns in
# cluster_map so it is available for later inspection.
if __name__ == "__main__":
    cluster_map = main()


Before handling missing values:
id                           0
name                       262
date                         0
manner_of_death              0
armed                      203
age                        304
gender                       3
race                       711
city                         0
state                        0
signs_of_mental_illness      0
threat_level                 0
flee                       472
body_camera                  0
latitude                   303
longitude                  303
is_geocoding_exact           0
dtype: int64
After handling missing values:
id                           0
name                       262
date                         0
manner_of_death              0
armed                      203
age                        304
gender                       3
race                       711
city                         0
state                        0
signs_of_mental_illness      0
threat_level                 0
flee                     

  super()._check_params_vs_input(X, default_n_init=10)


Map saved as cluster_map2.html
