In [None]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

# 1. Getting the webpage content of Wikipedia
# Fail fast on HTTP errors instead of trying to parse an error page.
wikipedia_response = requests.get("https://en.wikipedia.org/wiki/NE_postcode_area", timeout=30)
wikipedia_response.raise_for_status()
wikipedia = wikipedia_response.content

# 2. Converting the HTML content to a BeautifulSoup object
wikipedia_soup = BeautifulSoup(wikipedia, "html.parser")

# 3. Assigning the first table of the page to a variable
neighborhoods = wikipedia_soup.table
if neighborhoods is None:
    raise ValueError("No <table> element was found on the Wikipedia page.")

# 4. Converting the table rows to lists
# The row text is split on newlines and the pieces at positions 1, 3 and 5 are
# kept (the first three cells of the row, given how the page is laid out).
neighborhoods_lists = []
for tag in neighborhoods.find_all("tr"):
    temp_split = tag.text.split("\n")
    neighborhoods_lists.append([temp_split[i] for i in (1, 3, 5) if i < len(temp_split)])

# 5. Removing the "non-geographic" rows
# Rows that did not yield all three fields are skipped as well, so a short or
# irregular row can no longer raise an IndexError.
neighborhoods_rows = []
for lst in neighborhoods_lists:
    if len(lst) < 3 or "non-geographic" in lst[1]:
        continue
    lst[2] = lst[2].replace(" /", ",")
    neighborhoods_rows.append(lst)

if not neighborhoods_rows:
    raise ValueError("No usable rows could be extracted from the Wikipedia table.")

# 6. Converting the lists into a NumPy array and separating the header row from the values
neighborhoods_rows = np.array(neighborhoods_rows)
neighborhoods_columns = neighborhoods_rows[0, :]
neighborhoods_values = neighborhoods_rows[1:, :]

# 7. Creating a DataFrame of the neighborhoods in Newcastle, sorted by postal code
neighborhoods_df = pd.DataFrame(
    neighborhoods_values,
    columns=neighborhoods_columns
).rename(columns={"Postcode district": "PostalCode"}).sort_values(by=["PostalCode"])

# 8. Printing the number of rows in the DataFrame
print("This DataFrame contains {} rows.".format(neighborhoods_df.shape[0]))

neighborhoods_df.head()

In [None]:

# 1. Reading the coordinates file and giving its key column the same name as in neighborhoods_df
coordinates_df = pd.read_csv("Files/Geospatial_Coordinates.csv")
coordinates_df = coordinates_df.rename(columns={"Postcode district": "PostalCode"})

# 2. Joining the coordinates onto the neighborhoods
# No key is given, so pandas performs an inner join on every column name the two frames share.
neighborhoods_df = neighborhoods_df.merge(coordinates_df)

neighborhoods_df.head()

In [None]:
import os
from getpass import getpass

# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in favour
# of the top-level function; fall back to the old location on older versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# 1. Defining Foursquare credentials and version
# Credentials are not stored in the notebook: they are read from the
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables, with an
# interactive prompt as fallback when a variable is unset or empty.
# NOTE(review): any key pair that was ever committed with this notebook should be revoked.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID") or getpass("Foursquare client id: ")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET") or getpass("Foursquare client secret: ")
VERSION = "20200504"
radius = 500
limit = 100

# 2. Setting the neighborhood's coordinates (row with index label 0 of neighborhoods_df)
neighborhood_name = neighborhoods_df.loc[0, "Neighborhood"]
neighborhood_latitude = neighborhoods_df.loc[0, "Latitude"]
neighborhood_longitude = neighborhoods_df.loc[0, "Longitude"]

# 3. Creating the URL and sending the GET request
url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    limit
)
results = requests.get(url).json()

# 4. Getting the items in JSON
venues = results["response"]["groups"][0]["items"]

# 5. Converting JSON into a DataFrame
venues_df = json_normalize(venues)

# 6. Filtering the columns (copied so that adding columns below does not touch a slice)
filtered_columns = ["venue.name", "venue.categories", "venue.location.lat", "venue.location.lng"]
venues_df = venues_df.loc[:, filtered_columns].copy()

# 7. Returning the name of the first entry in venue.categories
# A venue with an empty category list gets None instead of raising an IndexError.
venues_df["category"] = venues_df["venue.categories"].apply(
    lambda categories: categories[0]["name"] if categories else None
)

# 8. Renaming the columns to the last part of their dotted name
venues_df.columns = [x.split(".")[-1] for x in venues_df.columns]

# 9. Printing the number of venues returned by Foursquare
print("{} venues were returned by Foursquare.".format(venues_df.shape[0]))

# 10. Adding the neighborhood to each row
venues_df["neighborhood"] = neighborhood_name

# 11. Setting neighborhood as first column
venues_df = venues_df[["neighborhood", "name", "category", "lat", "lng"]]

venues_df

In [None]:
# 1. Assigning necessary column values to variables
neighborhood_values = neighborhoods_df["Neighborhood"].values
latitude_values = neighborhoods_df["Latitude"].values
longitude_values = neighborhoods_df["Longitude"].values

# 2. Sending the GET request for each neighborhood and collecting its venue items in all_results
all_results = []
all_venues_neighborhoods = []
for hood, lat, lng in zip(neighborhood_values, latitude_values, longitude_values):

    # 2.1. Generating the URL
    # The query string must read "limit=<n>"; without the "=" the parameter name
    # becomes e.g. "limit100" and the requested limit is never applied.
    temp_url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, limit
    )

    # 2.2. Sending the GET request and keeping only the venue items
    temp_results = requests.get(temp_url).json()["response"]["groups"][0]["items"]

    # 2.3. Appending the results to all_results
    all_results.append(temp_results)

    # 2.4. Recording the neighborhood once per returned venue, so that this list
    # lines up row by row with the flattened venue table built in step 3
    all_venues_neighborhoods.extend([hood] * len(temp_results))

print("Number of results: {}".format(len(all_results)))
print("Number of venues: {}".format(len(all_venues_neighborhoods)))

# 3. Converting all venue items into one DataFrame
# The items are flattened into a single list and normalised once, rather than
# growing a DataFrame inside a loop (DataFrame.append was removed in pandas 2.0).
all_venues_df = json_normalize([venue for result in all_results for venue in result])

# 4. Formatting the DataFrame
# A venue whose category list is empty or missing gets None instead of raising.
all_venues_df["Category"] = all_venues_df["venue.categories"].apply(
    lambda categories: categories[0]["name"] if isinstance(categories, list) and categories else None
)
all_venues_df["Neighborhood"] = all_venues_neighborhoods
filtered_columns = ["Neighborhood", "venue.name", "Category", "venue.location.lat", "venue.location.lng"]
all_venues_df = all_venues_df.loc[:, filtered_columns].reset_index(drop=True)
all_venues_df.columns = ["Neighborhood", "Venue", "Venue Category", "Venue Latitude", "Venue Longitude"]

all_venues_df.head()

In [None]:
# 5. Reporting how many distinct venue categories occur across the Ncl neighborhoods
print("There are {} unique categories".format(len(pd.unique(all_venues_df["Venue Category"]))))

# 6. Showing the neighborhoods with the most venues (head() keeps the first five rows)
(
    all_venues_df
    .groupby("Neighborhood")["Venue"]
    .count()
    .reset_index()
    .sort_values(by=["Venue"], ascending=False)
    .reset_index(drop=True)
    .head()
)

In [None]:
# 1. One-hot encoding Venue Category: one indicator column per category,
#    named exactly after the category (empty prefix and separator)
ncl_dummies = pd.get_dummies(all_venues_df["Venue Category"], prefix="", prefix_sep="")

# 2. Attaching the neighborhood of each venue and moving that column to the front
ncl_dummies["neighborhood"] = all_venues_df["Neighborhood"]
fixed_columns = list(ncl_dummies.columns[-1:]) + list(ncl_dummies.columns[:-1])
ncl_dummies = ncl_dummies.loc[:, fixed_columns]

print(ncl_dummies.shape)
ncl_dummies.head()

In [None]:
# 3. Grouping rows by each neighborhood (number of venues per category)
ncl_grouped = ncl_dummies.groupby("neighborhood").sum().reset_index()

# 5. Creating a DataFrame containing the most common venues in each neighborhood
# One small frame is built per neighborhood and all of them are concatenated
# once at the end, which also yields a clean 0..n-1 index
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0).
common_venue_columns = ["Neighborhood", "Most Common", "Venue", "Count"]
common_venue_frames = []
for hood, category_counts in ncl_grouped.set_index("neighborhood").iterrows():
    # Keep only the categories present in this neighborhood, most frequent first;
    # a stable sort keeps the ranking reproducible between runs.
    present = category_counts[category_counts > 0].sort_values(ascending=False, kind="mergesort")
    if present.empty:
        continue
    common_venue_frames.append(
        pd.DataFrame(
            {
                "Neighborhood": hood,
                # Rank 1 is the most frequent category of the neighborhood.
                "Most Common": list(range(1, len(present) + 1)),
                "Venue": list(present.index),
                "Count": list(present.values),
            },
            columns=common_venue_columns,
        )
    )

if common_venue_frames:
    ncl_common_venues = pd.concat(common_venue_frames, ignore_index=True)
else:
    ncl_common_venues = pd.DataFrame(columns=common_venue_columns)

ncl_common_venues.head()

In [None]:
# 6. Reshaping to one row per neighborhood and one column per "Most Common" rank,
#    each cell holding the venue category at that rank; ranks a neighborhood
#    does not have are shown as empty strings
ncl_common_venues_pivot = ncl_common_venues.pivot(
    index="Neighborhood",
    columns="Most Common",
    values="Venue",
)
ncl_common_venues_pivot = ncl_common_venues_pivot.fillna("")

ncl_common_venues_pivot.head()

In [None]:
from sklearn.cluster import KMeans

# 1. Setting the number of clusters
clusters = 4

# 2. Dropping the neighborhood column
ncl_grouped_clustering = ncl_grouped.drop("neighborhood", axis=1)

# 3. Running K-Means clustering
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(ncl_grouped_clustering)

print(kmeans.labels_)

# 4. Adding clustering labels to a copy of ncl_common_venues_pivot
# The labels are keyed by neighborhood name (row order of ncl_grouped) and then
# looked up through the pivot's index, so they cannot be attached to the wrong row.
# Working on a new frame, and dropping any label column left over from an earlier
# run, keeps this cell re-runnable without modifying ncl_common_venues_pivot.
cluster_labels = pd.Series(kmeans.labels_, index=ncl_grouped["neighborhood"].values)
ncl_common_venues_labeled = ncl_common_venues_pivot.drop(columns=["Cluster Labels"], errors="ignore")
ncl_common_venues_labeled.insert(
    0,
    "Cluster Labels",
    cluster_labels.reindex(ncl_common_venues_labeled.index).values,
)

# 5. Merging neighborhoods_df with the labeled pivot table
ncl_common_venues_df = pd.DataFrame(ncl_common_venues_labeled.to_records())
ncl_merged = neighborhoods_df.join(ncl_common_venues_df.set_index("Neighborhood"), on="Neighborhood")
# Neighborhoods without a match in the pivot have no label; they get the value
# `clusters`, which lies one past the real labels 0..clusters-1.
ncl_merged["Cluster Labels"] = ncl_merged["Cluster Labels"].fillna(clusters).astype("int")

print(ncl_merged.shape[0])
ncl_merged.head()

In [None]:
def get_top_10_venues(dataframe, cluster, first_venue_column=6, top_n=10):
    """Rank the venue categories that occur most often within one cluster.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "Cluster Labels" column. Every column from position
        ``first_venue_column`` onwards is treated as a venue column whose cells
        hold a venue category name, or "" / a missing value when there is none.
    cluster : int
        1-based cluster number: rows whose "Cluster Labels" value equals
        ``cluster - 1`` are selected.
    first_venue_column : int, default 6
        Position of the first venue column in ``dataframe``.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns "Venue" and "Count", sorted by "Count" in descending order and
        limited to ``top_n`` rows.

    Also prints how many rows (neighborhoods) belong to the cluster.
    """
    cluster_rows = dataframe[dataframe["Cluster Labels"] == cluster - 1]

    # Flatten all venue cells of the cluster, skipping empty strings and
    # missing values (NaN / None), which are not venue names.
    venue_cells = cluster_rows.iloc[:, first_venue_column:].values.ravel()
    venues = [venue for venue in venue_cells if isinstance(venue, str) and venue != ""]

    # Count occurrences per venue; the stable sort keeps the result deterministic.
    venue_counts = (
        pd.DataFrame({"Venue": venues})
        .groupby("Venue")
        .size()
        .reset_index(name="Count")
        .sort_values(by="Count", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    if cluster_rows.shape[0] == 1:
        print("This cluster counts 1 neighborhood.")
    else:
        print("This cluster counts {} neighborhoods.".format(cluster_rows.shape[0]))
    return venue_counts.head(top_n)

In [None]:
# Most frequent venue categories in cluster 1 (rows labelled 0)
get_top_10_venues(dataframe=ncl_merged, cluster=1)

In [None]:
# Most frequent venue categories in cluster 2 (rows labelled 1)
get_top_10_venues(dataframe=ncl_merged, cluster=2)

In [None]:
# Most frequent venue categories in cluster 3 (rows labelled 2)
get_top_10_venues(dataframe=ncl_merged, cluster=3)