In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
from pymongo import MongoClient
import csv
from geopy.geocoders import Bing
from config import BING_API_KEY



In [2]:
# Load data set into pandas
df = pd.read_csv ("heart_2022_with_nans.csv")
# for i, column in enumerate (df.columns, start = 1):
#     print(i, df[column].unique(),'\n\n')
df.shape[0]
# df.shape[1]

445132

In [3]:
df.columns

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')

In [4]:
# Define function to get coordinates for a given location (state)
def get_coordinates(state_name):
    geocoder = Bing(api_key=BING_API_KEY)
    try:
        location = geocoder.geocode(state_name + ', USA', exactly_one=True)
        if location:
            return [location.latitude, location.longitude]
        else:
            print('Failed to geocode state:', state_name)
            return None
    except Exception as e:
        print('Error:', e)
        return None

# List of states
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", 
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", 
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", 
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", 
    "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
    "New Jersey", "New Mexico", "New York", "North Carolina", 
    "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
    "Texas", "Utah", "Vermont", "Virginia", "Washington", 
    "West Virginia", "Wisconsin", "Wyoming"
]

# Fetch coordinates for each state
coordinates = {}
for state in states:
    coordinates[state] = get_coordinates(state)

# Function to update CSV file with coordinates
def update_csv_with_coordinates(csv_file, coordinates):
    updated_data = []
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            state = row['State']
            if state in coordinates:
                row['Latitude'], row['Longitude'] = coordinates[state]
            updated_data.append(row)
    
    # Write updated data to a new CSV file
    with open('updated_' + csv_file, 'w', newline='') as updated_file:
        writer = csv.DictWriter(updated_file, fieldnames=updated_data[0].keys())
        writer.writeheader()
        writer.writerows(updated_data)

# Update CSV file with latitude and longitude
update_csv_with_coordinates('heart_2022_with_nans.csv', coordinates)


In [5]:
# Check the dataset
df = pd.read_csv ("updated_heart_2022_with_nans.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 42 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer              44

In [6]:
columns = ['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
           'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
           'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
           'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
           'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
           'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
           'DifficultyConcentrating', 'DifficultyWalking',
           'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
           'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
           'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
           'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
           'HighRiskLastYear', 'CovidPos','Latitude','Longitude']

# Check for duplicates based on the specified columns
num_duplicates = df.duplicated(subset=columns).sum()

# Get the duplicated rows
Duplicated_data = df[df.duplicated(subset=columns, keep=False)]

# Remove duplicate rows based on the specified columns
df_no_duplicates = df.drop_duplicates(subset=columns, keep="first")

# Output the results
print(f"Number of duplicate rows: {num_duplicates}")
print("Duplicated data:")
print(Duplicated_data)
print("DataFrame without duplicates:")
print(df_no_duplicates)

Number of duplicate rows: 157
Duplicated data:
             State     Sex GeneralHealth  PhysicalHealthDays  \
4712        Alaska    Male     Very good                 0.0   
7310        Alaska    Male     Very good                 0.0   
10696      Arizona  Female     Excellent                 0.0   
11503      Arizona  Female     Excellent                 0.0   
26789   California    Male     Excellent                 0.0   
...            ...     ...           ...                 ...   
428370   Wisconsin    Male          Good                 0.0   
431351   Wisconsin    Male          Good                 0.0   
431653   Wisconsin    Male     Excellent                 0.0   
433340     Wyoming  Female          Good                 0.0   
433508     Wyoming  Female          Good                 0.0   

        MentalHealthDays                                    LastCheckupTime  \
4712                 0.0  Within past year (anytime less than 12 months ...   
7310                 0.0  

In [7]:
df_no_duplicates.shape[0]

444975

In [8]:
numeric_columns = df_no_duplicates.select_dtypes(include=['int64', 'float64']).columns

# Compute basic statistical measures
stats_summary = df_no_duplicates[numeric_columns].describe().transpose()

# Display the statistical summary
print(stats_summary)

                       count       mean        std        min         25%  \
PhysicalHealthDays  434053.0   4.349372   8.689968    0.00000    0.000000   
MentalHealthDays    435913.0   4.384164   8.388541    0.00000    0.000000   
SleepHours          439527.0   7.022909   1.502618    1.00000    6.000000   
HeightInMeters      416470.0   1.702690   0.107178    0.91000    1.630000   
WeightInKilograms   403044.0  83.074632  21.448241   22.68000   68.040000   
BMI                 396316.0  28.529907   6.554917   12.02000   24.130000   
Latitude            432433.0  39.872577   6.204813   19.61087   37.254669   
Longitude           432433.0 -93.323396  18.821079 -155.52742 -105.547836   

                          50%        75%         max  
PhysicalHealthDays   0.000000   3.000000   30.000000  
MentalHealthDays     0.000000   5.000000   30.000000  
SleepHours           7.000000   8.000000   24.000000  
HeightInMeters       1.700000   1.780000    2.410000  
WeightInKilograms   80.740000  

In [9]:
# Check for NaN values
for column in df_no_duplicates.columns:
    if df_no_duplicates[column].isnull().any():
        print(f"The column '{column}' has NaN values:")
        # Count of NaN values in each column
        print(df_no_duplicates[column].isna().sum())
    else:
        print (f"Checked")

Checked
Checked
The column 'GeneralHealth' has NaN values:
1193
The column 'PhysicalHealthDays' has NaN values:
10922
The column 'MentalHealthDays' has NaN values:
9062
The column 'LastCheckupTime' has NaN values:
8301
The column 'PhysicalActivities' has NaN values:
1088
The column 'SleepHours' has NaN values:
5448
The column 'RemovedTeeth' has NaN values:
11355
The column 'HadHeartAttack' has NaN values:
3060
The column 'HadAngina' has NaN values:
4400
The column 'HadStroke' has NaN values:
1552
The column 'HadAsthma' has NaN values:
1768
The column 'HadSkinCancer' has NaN values:
3138
The column 'HadCOPD' has NaN values:
2214
The column 'HadDepressiveDisorder' has NaN values:
2807
The column 'HadKidneyDisease' has NaN values:
1921
The column 'HadArthritis' has NaN values:
2628
The column 'HadDiabetes' has NaN values:
1082
The column 'DeafOrHardOfHearing' has NaN values:
20502
The column 'BlindOrVisionDifficulty' has NaN values:
21419
The column 'DifficultyConcentrating' has NaN value

In [10]:
# df = df.replace(['NA', 'N/A', '-', 'None'], np.nan, inplace=True)

In [11]:
# Drop rows with any NaN values and keep all the columns
df_cleaned = df_no_duplicates.dropna()
# df_cleaned.isna().sum()
df_cleaned.shape[0]

238407

In [12]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 238407 entries, 342 to 435820
Data columns (total 42 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      238407 non-null  object 
 1   Sex                        238407 non-null  object 
 2   GeneralHealth              238407 non-null  object 
 3   PhysicalHealthDays         238407 non-null  float64
 4   MentalHealthDays           238407 non-null  float64
 5   LastCheckupTime            238407 non-null  object 
 6   PhysicalActivities         238407 non-null  object 
 7   SleepHours                 238407 non-null  float64
 8   RemovedTeeth               238407 non-null  object 
 9   HadHeartAttack             238407 non-null  object 
 10  HadAngina                  238407 non-null  object 
 11  HadStroke                  238407 non-null  object 
 12  HadAsthma                  238407 non-null  object 
 13  HadSkinCancer              23840

In [13]:
# Dropping columns w/ more than 50000 nan: 
columns = [column for column in df_no_duplicates.columns if df_no_duplicates[column].isna().sum() >= 50000]
df_df = df_no_duplicates.drop(columns=columns)
df_df = df_df.dropna()
df_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302498 entries, 342 to 435824
Data columns (total 36 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      302498 non-null  object 
 1   Sex                        302498 non-null  object 
 2   GeneralHealth              302498 non-null  object 
 3   PhysicalHealthDays         302498 non-null  float64
 4   MentalHealthDays           302498 non-null  float64
 5   LastCheckupTime            302498 non-null  object 
 6   PhysicalActivities         302498 non-null  object 
 7   SleepHours                 302498 non-null  float64
 8   RemovedTeeth               302498 non-null  object 
 9   HadHeartAttack             302498 non-null  object 
 10  HadAngina                  302498 non-null  object 
 11  HadStroke                  302498 non-null  object 
 12  HadAsthma                  302498 non-null  object 
 13  HadSkinCancer              30249

In [14]:
# df_df: Columns got removed before dropna
# df_cleaned: Got cleaned w/o columns removal
print (f"df_df has {df_df.shape[0]} rows")
print (f"df_df has {df_df.shape[1]} columns")
print (f"df_cleaned has {df_cleaned.shape[0]} rows")
print (f"df_cleaned has {df_cleaned.shape[1]} columns")

df_df has 302498 rows
df_df has 36 columns
df_cleaned has 238407 rows
df_cleaned has 42 columns


In [15]:
df_df.to_csv('Heart_36_Col.csv', index=False)

# Save df_cleaned to a CSV file
df_cleaned.to_csv('Heart_42_Col.csv', index=False)

print("DataFrames saved as CSV files successfully!")

DataFrames saved as CSV files successfully!


In [16]:
# # Convert cleaned DataFrames to dictionaries
# data_dict_df = df_df.to_dict(orient='records')
# data_dict_cleaned = df_cleaned.to_dict(orient='records')

# # Connect to MongoDB
# client = MongoClient('mongodb://localhost:27017/')  

# # Access the default database or create it if it doesn't exist
# db = client['Heart_database']  

# # Access or create colleActions
# collection_34_col = db['Heart_dataset_34_Col']
# collection_40_col = db['Herat_dataset_40_Col']  

# # Insert data into collections
# collection_34_col.insert_many(data_dict_df)
# collection_40_col.insert_many(data_dict_cleaned)

# print("Data inserted successfully into MongoDB collections!")

Data inserted successfully into MongoDB collections!


In [17]:
# from geopy.geocoders import Bing
# from geopy.exc import GeocoderTimedOut

# # Define function to get coordinates for a given location (state)
# def get_coordinates(state_name):
#     geocoder = Bing(api_key=BING_API_KEY)
#     coordinates = {}
#     for state in state_name:
#         try:
#             location = geocoder.geocode(state, exactly_one=True)
#             if location:
#                 coordinates[state] = [location.latitude, location.longitude]
#             else:
#                 print('Failed to geocode state:', state)
#         except Exception as e:
#             print('Error:', e)
#     return coordinates

# # List of states
# states = [
#     "Alabama", "Alaska", "Arizona", "Arkansas", "California", 
#     "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", 
#     "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
#     "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", 
#     "Massachusetts", "Michigan", "Minnesota", "Mississippi", 
#     "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
#     "New Jersey", "New Mexico", "New York", "North Carolina", 
#     "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
#     "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
#     "Texas", "Utah", "Vermont", "Virginia", "Washington", 
#     "West Virginia", "Wisconsin", "Wyoming"
# ]

# # Define function to update MongoDB files with coordinates
# def update_mongodb_with_coordinates(collection, state_name, coordinates):
#     collection.update_many({'State': state_name}, {'$set': {'Coordinates': coordinates}})

# # Access or create collections
# collection_34_col = db['Heart_dataset_34_Col']
# collection_40_col = db['Herat_dataset_40_Col']  

# # Iterate over each state
# for state in states:
#     # Get coordinates for the state
#     coordinates = get_coordinates(state)
    
#     # Update MongoDB collections with coordinates
#     update_mongodb_with_coordinates(collection_34_col, state, coordinates)
#     update_mongodb_with_coordinates(collection_40_col, state, coordinates)

# print("Coordinates updated successfully in MongoDB collections!")

In [18]:
import geopandas as gpd

# Load US states shapefile
us_states = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Filter US states by name
us_states = us_states[us_states['name'].isin([
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina',
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
    'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
    'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming'
])]

# Save the filtered data as a GeoJSON file
us_states.to_file("us-states.geojson", driver='GeoJSON')


  us_states = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [19]:
# import folium
# from pymongo import MongoClient
# from folium import plugins
# from IPython.display import HTML

# # Function to create choropleth map using Folium
# def create_choropleth_map(collection):
#     # Connect to MongoDB
#     client = MongoClient('mongodb://localhost:27017/')  # Assuming MongoDB is running on the default port
#     db = client['your_database_name']  # Replace with the name of your database

#     # Create a map centered on the United States
#     m = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

#     # Aggregate data to count data points for each state
#     pipeline = [
#         {"$group": {"_id": "$State", "count": {"$sum": 1}}}
#     ]
#     state_counts = list(collection.aggregate(pipeline))

#     # Add choropleth layer with state data counts
#     folium.Choropleth(
#         geo_data='us-states.geojson',  
#         name='choropleth',
#         data=dict([(state['_id'], state['count']) for state in state_counts]),
#         columns=['State', 'Count'],
#         key_on='feature.id',
#         fill_color='YlGn',
#         fill_opacity=0.7,
#         line_opacity=0.2,
#         legend_name='Data Points Distribution by State'
#     ).add_to(m)

#     # Add circle markers for each state
#     for state in state_counts:
#         folium.CircleMarker(
#             location=[state['_id']['coordinates'][0], state['_id']['coordinates'][1]],
#             radius=state['count'] * 10,  # Adjust the scale as needed
#             color='red',
#             fill=True,
#             fill_color='red',
#             fill_opacity=0.5,
#             tooltip=f"{state['_id']}: {state['count']} cases"
#         ).add_to(m)

#     # Add layer control
#     folium.LayerControl().add_to(m)

#     return m
    
# # Function to display choropleth map using Folium
# def display_choropleth_map(m):
#     # Generate HTML for the map
#     html_map = m._repr_html_()
#     # Display the map
#     display(HTML(html_map))

# # Access or create collections
# client = MongoClient('mongodb://localhost:27017/')
# db = client['your_database_name']
# collection_34_col = db['Heart_dataset_34_Col']
# collection_40_col = db['Heart_dataset_40_Col']

# # Call the function to create choropleth maps for each collection
# choropleth_map_34 = create_choropleth_map(collection_34_col)
# choropleth_map_40 = create_choropleth_map(collection_40_col)

# # Save the maps to HTML files
# choropleth_map_34.save('choropleth_map_34.html')
# choropleth_map_40.save('choropleth_map_40.html')

# # Display the maps
# display_choropleth_map(choropleth_map_34)
# display_choropleth_map(choropleth_map_40)


In [21]:
# from pymongo import MongoClient
# import folium

# def create_choropleth_map_from_mongodb(collection):
#     m = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

#     # Aggregate case counts for each state directly from MongoDB
#     pipeline = [
#         {"$group": {"_id": "$State", "count": {"$sum": 1}}}
#     ]
#     state_counts = collection.aggregate(pipeline)

#     # Iterate over the aggregated data and add markers to the map
#     for state_data in state_counts:
#         state = state_data['_id']
#         count = state_data['count']
#         # Retrieve state coordinates from MongoDB or another source
#         # Adjust the marker size and other properties as needed
#         lat, lon = get_state_coordinates(state)
#         color = get_color_for_count(count)
#         marker_size = math.sqrt(count) * 0.2
#         popup_text = f'<div style="width:250px; height:40px;"><b>State:</b> {state}<br><b>Cases:</b> {count}</div>'
#         folium.CircleMarker(location=[lat, lon], radius=marker_size, color=color, fill=True, fill_opacity=0.6,
#                             popup=popup_text).add_to(m)

#     # Add legend and other elements as before

#     return m

# # Connect to MongoDB
# client = MongoClient('mongodb://localhost:27017/')
# db = client['Heart_database']
# collection_34_col = db['Heart_dataset_34_Col']
# collection_40_col = db['Heart_dataset_40_Col']

# # Call the function to create choropleth maps for each collection
# choropleth_map_34 = create_choropleth_map_from_mongodb(collection_34_col)
# choropleth_map_40 = create_choropleth_map_from_mongodb(collection_40_col)

# # Save the maps to HTML files (optional)
# # choropleth_map_34.save('choropleth_map_34.html')
# # choropleth_map_40.save('choropleth_map_40.html')

# # Convert the maps to HTML and display them
# html_map_34 = choropleth_map_34._repr_html_()
# html_map_40 = choropleth_map_40._repr_html_()

# # Display the maps
# display(HTML(html_map_34))
# display(HTML(html_map_40))


NameError: name 'get_state_coordinates' is not defined

Top Features:
               Feature  Importance
76       HadAngina_Yes    0.081225
75        HadAngina_No    0.079531
5                  BMI    0.048095
4    WeightInKilograms    0.044349
3       HeightInMeters    0.036403
7            Longitude    0.033228
6             Latitude    0.032984
2           SleepHours    0.028691
0   PhysicalHealthDays    0.022667
1     MentalHealthDays    0.017664
