<div style="border:2px solid black; padding:10px">
    
# <font color="blue">Objective: </font>Compare US data to the rest of the world
</div>

# Import Dependencies

In [1]:
import pandas as pd

# DBSCAN: density-based clustering, used below to group coordinates
from sklearn.cluster import DBSCAN

# Silence every FutureWarning for the rest of the session
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Remove pandas' display limits so frames print every row and column
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Restore `df` and `gc` from IPython's %store database. Both must have
# been saved with `%store` by an earlier notebook/session, otherwise the
# cells below fail on a fresh kernel.
# NOTE(review): `gc` here is the city-lookup object queried through
# `gc.get_cities_by_name(...)` below, not the stdlib `gc` module.
%store -r df
%store -r gc

# nbimporter makes the sibling notebook importable like a module
import nbimporter
from disease_headlines_part3 import great_circle_distance

Importing Jupyter notebook from disease_headlines_part3.ipynb


<hr style="border-top: 2px solid black;">

# Compare US data to the rest of the world

- Re-cluster US and non-US cities separately, using a smaller epsilon parameter for the US

In [2]:
def get_country_code(city_name):
    """Map a city name to the country code of its most populous match.

    Looks ``city_name`` up with ``gc.get_cities_by_name`` and, among all
    cities sharing that name, picks the one with the largest
    ``'population'`` value.

    Parameters
    ----------
    city_name : str
        Name passed unchanged to ``gc.get_cities_by_name``.

    Returns
    -------
    The ``'countrycode'`` value of the most populous match, or ``None``
    when the lookup returns no match at all (previously this case raised
    an opaque ``ValueError`` from ``max()`` on an empty sequence).
    """
    matches = gc.get_cities_by_name(city_name)
    # Each match is a mapping whose first value is the city record;
    # unwrap it once instead of rebuilding the list for every access.
    records = [next(iter(match.values())) for match in matches]
    # `default=None` keeps an unknown city from crashing a whole
    # DataFrame.apply() run; ties keep the first match, as before.
    best = max(records, key=lambda record: record['population'], default=None)
    if best is None:
        return None
    return best['countrycode']

# Add a 'Country_code' column to the whole dataframe by looking up each
# row's City; the column is used afterwards to filter out the US rows.
# Note: this writes the new column onto `df` itself.
df['Country_code'] = df['City'].apply(get_country_code)

In [3]:
# Split the frame in two by country code: everything that is not tagged
# 'US' on one side, the 'US' rows on the other
df_not_us = df.loc[df['Country_code'] != 'US']
us_df = df.loc[df['Country_code'] == 'US']

### Re-clustering function

In [4]:
def re_cluster(input_df, eps, min_samples=3):
    """Cluster the rows of ``input_df`` by coordinates and drop the noise.

    Runs DBSCAN on the ``'Latitude'``/``'Longitude'`` columns using
    ``great_circle_distance`` as the metric.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``'Latitude'`` and ``'Longitude'`` columns. It is not
        modified; a new frame is returned.
    eps : float
        DBSCAN neighbourhood radius: the maximum distance between two
        points for one to count as a neighbour of the other (not the
        distance between clusters). Expressed in whatever unit
        ``great_circle_distance`` returns.
    min_samples : int, default 3
        Number of neighbours a point needs to be a DBSCAN core point.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with an added integer ``'Cluster'`` column,
        restricted to rows whose label is greater than -1 (DBSCAN labels
        noise points -1).
    """
    # DBSCAN rejects an array with zero samples; an empty selection simply
    # has no clusters, so return it with an (empty) Cluster column.
    if len(input_df) == 0:
        no_labels = pd.Series(index=input_df.index, dtype='int64')
        return input_df.assign(Cluster=no_labels)

    coordinates = input_df[['Latitude', 'Longitude']].values
    model = DBSCAN(eps=eps, min_samples=min_samples,
                   metric=great_circle_distance)
    labelled = input_df.assign(Cluster=model.fit_predict(coordinates))
    # Keep only rows that were assigned to a cluster
    return labelled[labelled.Cluster > -1]

In [5]:
# Re-cluster the US rows and the rows of all other countries separately.
# The US uses a smaller distance (eps) than the rest of the world so that
# its clusters are denser.
us_df = re_cluster(us_df, eps=125)
df_not_us = re_cluster(df_not_us, eps=250)

### Evaluate grouped data
 - Determine how many clusters are in the non-US countries and in the US
 - Find the largest cluster (by number of headlines)

In [6]:
# Partition the clustered non-US rows by their 'Cluster' label and
# report how many distinct clusters there are
groups = df_not_us.groupby('Cluster')
num_groups = groups.ngroups
print(f"{num_groups} Non-US have been clusters detected")

31 Non-US have been clusters detected


### Return the data from the largest cluster <code>largest_group</code>

In [7]:
# Materialise the (cluster label, frame) pairs and order them from the
# cluster with the most rows to the one with the fewest
sorted_groups = list(groups)
sorted_groups.sort(key=lambda pair: pair[1].shape[0], reverse=True)

# The first pair is therefore the largest cluster
group_id, largest_group = sorted_groups[0]
group_size = largest_group.shape[0]
print(f"Largest cluster contains {group_size} headlines")

Largest cluster contains 51 headlines


<div style="border:1px solid black; padding:10px">
<font color="blue">Note:</font><br>
There are 31 clusters of headlines outside the US. <br>
The largest cluster contains 51 headlines.<br>
</div>

In [8]:
# Persist the results in IPython's %store database so that another
# notebook/session can reload them with `%store -r <name>`
%store df_not_us
%store us_df
%store groups
%store largest_group
%store sorted_groups

Stored 'df_not_us' (DataFrame)
Stored 'us_df' (DataFrame)
Stored 'groups' (DataFrameGroupBy)
Stored 'largest_group' (DataFrame)
Stored 'sorted_groups' (list)
