CITY


In [12]:
import pandas as pd
import requests
from io import StringIO

# Source for the most populated cities: https://worldpopulationreview.com/cities
city_url = 'https://worldpopulationreview.com/cities'

# The site answered 403 Forbidden without it, so send a browser-like
# User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server;
    # when it elapses requests raises a RequestException subclass, which the
    # first handler below reports.
    response = requests.get(city_url, headers=headers, timeout=30)
    response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)

    # Parse every HTML table on the page into a list of DataFrames
    tables = pd.read_html(StringIO(response.text))

    # The first table is taken to be the city ranking
    df_cities = tables[0]

    # Reduce the table to a plain list of city names
    cities = df_cities['City'].tolist()
    print("Successful")
    print(f"Found {len(cities)} cities.")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
except (ValueError, IndexError, KeyError):
    # pd.read_html raises ValueError when the page contains no tables, and a
    # table without a 'City' column raises KeyError; like an empty table list
    # (IndexError), both mean the expected table is not there.
    print("Could not find the expected table on the page.")

Successful
Found 822 cities.


In [29]:
# Keep only the first MAX_CITIES entries of the scraped list.
# Named constant instead of a bare 500 so the cut-off is easy to find and tune.
MAX_CITIES = 500
cities = cities[:MAX_CITIES]

In [31]:
# Wrap the city names in a single-column DataFrame whose column is labelled 'City'.
cities = pd.DataFrame(cities, columns=['City'])

In [32]:
# Bare last expression: the notebook renders the city DataFrame as the cell output.
cities

Unnamed: 0,City
0,Tokyo
1,Delhi
2,Shanghai
3,Dhaka
4,Cairo
...,...
495,Aguascalientes
496,Siliguri
497,Amsterdam
498,Tshikapa


COUNTRY

In [17]:
# URL to download the official country list CSV (ISO 3166-1 standard) from DataHub.io
url = "https://datahub.io/core/country-list/r/data.csv"

# Read the CSV directly from the URL.
# keep_default_na=False stops pandas from treating the literal string "NA" as
# a missing value: "NA" is Namibia's ISO 3166-1 alpha-2 code, so the default
# parsing would silently turn that row's Code into NaN.
df_countries = pd.read_csv(url, keep_default_na=False)

# Display count and first few entries
print(f"Total entries: {len(df_countries)}")
print(df_countries.head())


Total entries: 249
             Name Code
0     Afghanistan   AF
1   Åland Islands   AX
2         Albania   AL
3         Algeria   DZ
4  American Samoa   AS


In [19]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [22]:
import pandas as pd
from unidecode import unidecode

# Take an independent copy of the country-name column of df_countries
country_names = df_countries['Name'].copy()

# Transliterate each name to plain ASCII (e.g. "Åland Islands" -> "Aland Islands")
countries = country_names.map(unidecode)

# Peek at the first few converted names
print(countries.head())


0       Afghanistan
1     Aland Islands
2           Albania
3           Algeria
4    American Samoa
Name: Name, dtype: object


In [24]:
# Bare last expression: the notebook renders the ASCII-normalised country names as the cell output.
countries

Unnamed: 0,Name
0,Afghanistan
1,Aland Islands
2,Albania
3,Algeria
4,American Samoa
...,...
244,Wallis and Futuna
245,Western Sahara
246,Yemen
247,Zambia


Combining both

In [35]:
# Build frame versions under local names instead of rebinding `cities` /
# `countries`. Turning the global `countries` Series into a DataFrame here
# would make later cells that iterate it (`for a in countries`) loop over the
# column labels rather than over the country names.
cities_frame = cities.to_frame() if isinstance(cities, pd.Series) else cities
countries_frame = countries.to_frame() if isinstance(countries, pd.Series) else countries

# Stack both frames and tag every row with the kind of place it holds.
# NOTE(review): the two inputs use different column labels ('City' vs 'Name'),
# so the result keeps both columns and fills the non-applicable one with NaN.
combined = pd.concat([
    cities_frame.assign(Type='City'),
    countries_frame.assign(Type='Country')
], ignore_index=True)

print(combined)



         City     Type               Name
0       Tokyo     City                NaN
1       Delhi     City                NaN
2    Shanghai     City                NaN
3       Dhaka     City                NaN
4       Cairo     City                NaN
..        ...      ...                ...
744       NaN  Country  Wallis and Futuna
745       NaN  Country     Western Sahara
746       NaN  Country              Yemen
747       NaN  Country             Zambia
748       NaN  Country           Zimbabwe

[749 rows x 3 columns]


# Making the graphs, starting with countries

In [45]:
import networkx as nx
import plotly.graph_objects as go

# `countries` may have been rebound to a one-column DataFrame by the combining
# cell. Iterating a DataFrame yields its column labels, not its values, so
# pull the names column back out before building the graph.
if isinstance(countries, pd.DataFrame):
    chain_names = countries.iloc[:, 0]
else:
    chain_names = countries

# Build directed graph by last/first letter: an edge a -> b means the last
# character of `a` equals the first character of `b`, ignoring case.
G = nx.DiGraph()
G.add_nodes_from(chain_names)
for a in chain_names:
    for b in chain_names:
        if a != b and a.strip()[-1].lower() == b.strip()[0].lower():
            G.add_edge(a, b)

# Layout for nodes. spring_layout starts from random positions, so a fixed
# seed is needed for the figure to come out the same on every run.
pos = nx.spring_layout(G, k=0.15, iterations=50, seed=42)
x_nodes = [pos[k][0] for k in G.nodes()]
y_nodes = [pos[k][1] for k in G.nodes()]

# Edge coordinates: each edge contributes (start, end, None); the None breaks
# the line so all edges can be drawn by a single Scatter trace.
edge_x = []
edge_y = []
for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Make plotly graph
edge_trace = go.Scatter(x=edge_x, y=edge_y,
                        line=dict(width=0.5, color='#888'),
                        hoverinfo='none', mode='lines')

# No `showscale` on the marker: its colour is one constant, so there is no
# numeric scale for a colour bar to represent.
node_trace = go.Scatter(x=x_nodes, y=y_nodes,
                        mode='markers+text',
                        text=list(G.nodes()),
                        hoverinfo='text',
                        marker=dict(color='skyblue',
                                    size=10,
                                    line=dict(width=2)),
                        textposition="top center")

# The top margin leaves room for the title instead of squeezing it out (t=0).
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(showlegend=False,
                                 hovermode='closest',
                                 margin=dict(b=0, l=0, r=0, t=30),
                                 xaxis=dict(showgrid=False, zeroline=False),
                                 yaxis=dict(showgrid=False, zeroline=False),
                                 title="Interactive Country Chain Graph"))

fig.show()


In [40]:
!pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, pyvis
Successfully installed jedi-0.19.2 pyvis-0.3.2


In [47]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
from unidecode import unidecode

# Load and normalize country names.
# The CSV is read again here so this cell does not depend on what earlier
# cells did to `countries`. keep_default_na=False keeps the literal code "NA"
# (Namibia) as a string instead of letting pandas parse it as a missing value.
url = "https://datahub.io/core/country-list/r/data.csv"
df = pd.read_csv(url, keep_default_na=False)
countries = df['Name'].apply(unidecode)

# First and last character (lower-cased) of every non-blank name, computed
# once per name rather than once per pair inside the nested loop below.
# NOTE(review): a name ending in a non-letter such as ')' or '.' keeps that
# character as its "last letter" and so gets no outgoing edges -- confirm
# whether that is intended.
name_ends = {}
for name in countries:
    trimmed = name.strip()
    if trimmed:  # a blank name has no first/last character to chain on
        name_ends[name] = (trimmed[0].lower(), trimmed[-1].lower())

# Create directed graph: an edge a -> b means the last character of `a`
# equals the first character of `b`.
G = nx.DiGraph()
G.add_nodes_from(countries)
for a, (_, a_last) in name_ends.items():
    for b, (b_first, _) in name_ends.items():
        if a != b and a_last == b_first:
            G.add_edge(a, b)

# Layout positions. spring_layout starts from random positions, so a fixed
# seed is needed for the figure to come out the same on every run.
pos = nx.spring_layout(G, k=0.15, iterations=50, seed=42)
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

# Build edge coordinates for Plotly: each edge contributes (start, end, None);
# the None breaks the line so one Scatter trace can draw every edge.
edge_x, edge_y = [], []
for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none', mode='lines'
)

# Prepare hover text with in-degree and out-degree info
hover_texts = [
    f"<b>{node}</b><br>Pointed to by: {G.in_degree(node)}<br>Points to: {G.out_degree(node)}"
    for node in G.nodes()
]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=list(G.nodes()),
    hoverinfo='text',
    hovertext=hover_texts,
    marker=dict(
        size=10,
        color='skyblue',
        line_width=2
    ),
    textposition="top center"
)

fig = go.Figure(data=[edge_trace, node_trace],
    layout=go.Layout(
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=30),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
        title="Interactive Country Chain Directed Graph"
    ))

fig.show()
