In [None]:
import random
import math

import matplotlib.pyplot as plt
import numpy as np
import community as community_louvain
import pandas as pd
import networkx as nx

import matplotlib.patches as mpatches

from collections import defaultdict, Counter
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from colorama import Fore, Style



# PROJECT 8: TripAdvisor European restaurants

In [None]:
# General parameters

number_of_nodes = 1000 # Number of restaurants sampled as graph nodes (between 0 and 1000000; between 1000 and 10000 recommended, depending on the computer)
same_dataset = False # Set to True to sample only restaurants with non-empty keywords, so one sample can serve every analysis (recommended: True)
random_state = 88 # Seed used when sampling; change it to test another subset

#### 1: Graph Construction

In [None]:
# Reading data: only the columns used by the analysis are loaded

columns_to_keep = [
    'restaurant_name',
    'restaurant_link',
    'original_location',
    'country',
    'cuisines',
    'special_diets',
    'features',
    'latitude',
    'longitude',
    'popularity_generic',
    'total_reviews_count',
    'avg_rating',
    'keywords',
    'vegetarian_friendly',
    'vegan_options',
    'gluten_free',
    'awards'
]

# `usecols` makes pandas parse only the needed columns instead of loading the whole
# file and discarding most of it; the trailing selection keeps the column order of
# `columns_to_keep`.
df = pd.read_csv("tripadvisor_european_restaurants.csv", usecols=columns_to_keep)[columns_to_keep]
df.head()

In [None]:
# Convert comma-separated strings into lists of items

def split_string_to_list(s):
    """Split a comma-separated string into a list of stripped, non-empty items.

    - a string is split on ","; surrounding whitespace is removed and empty items
      (e.g. produced by a trailing comma) are dropped
    - a list is returned unchanged, so applying the conversion again to an already
      converted column does not wipe its content
    - any other value (NaN, None, ...) gives an empty list
    """
    if isinstance(s, list):
        return s
    if isinstance(s, str):
        stripped_items = (item.strip() for item in s.split(","))
        return [item for item in stripped_items if item]
    return []

# Columns holding comma-separated text: every cell is turned into a list of items
list_columns = ("cuisines", "special_diets", "features", "keywords")
for column in list_columns:
    df[column] = df[column].map(split_string_to_list)

In [None]:
# Sampling data: draw `number_of_nodes` restaurants, seeded by `random_state`

if same_dataset:
    # Restrict the pool to restaurants having at least one keyword
    has_keywords = df['keywords'].map(lambda kw: isinstance(kw, list) and len(kw) > 0)
    df_keywords = df[has_keywords].reset_index(drop=True)
    sampling_pool = df_keywords
else:
    sampling_pool = df

df_sample = sampling_pool.sample(n=number_of_nodes, random_state=random_state).reset_index(drop=True)


In [None]:
# Preview the first rows of the sample
df_sample.head()

In [None]:
# Construct the graph: one node per sampled restaurant; two restaurants are linked when
# they share at least one cuisine, special diet or feature, or have the same city tag.

def _last_location_part(raw_location, drop_all_spaces):
    """Return the last comma-separated element of an `original_location` string.

    With `drop_all_spaces` every space is removed (form used as a matching tag);
    otherwise only a space placed right before a quote is removed (form stored on
    the node as `city`).
    """
    cleaned = raw_location.replace("[", "").replace("]", "")
    if drop_all_spaces:
        cleaned = cleaned.replace('"', "").replace(" ", "")
    else:
        cleaned = cleaned.replace(' "', "").replace('"', "")
    return cleaned.split(",")[-1]


G = nx.Graph()

index = defaultdict(set)   # tag -> ids of the restaurants carrying that tag
profiles = {}              # id -> (cuisines set, diets set, features set, city tag)
tags_by_node = {}          # id -> ordered list of all tags of the restaurant

# Adding nodes and indexing their tags
for idx, row in df_sample.iterrows():
    G.add_node(
        idx,
        name=row['restaurant_name'],
        city=_last_location_part(row['original_location'], drop_all_spaces=False),
        cuisines=row['cuisines'],
        special_diets=row['special_diets'],
        features=row['features'],
        popularity_generic=row['popularity_generic'],
        total_reviews_count=row['total_reviews_count'],
        avg_rating=row['avg_rating'],
        keywords=row['keywords'],
        vegetarian_friendly=row['vegetarian_friendly'],
        vegan_options=row['vegan_options'],
        gluten_free=row['gluten_free'],
    )

    city_tag = _last_location_part(row['original_location'], drop_all_spaces=True)
    profiles[idx] = (set(row['cuisines']), set(row['special_diets']), set(row['features']), city_tag)
    tags_by_node[idx] = row['cuisines'] + row['special_diets'] + row['features'] + [city_tag]
    for tag in tags_by_node[idx]:
        index[tag].add(idx)

# Adding edges
added_edges = set()

for idx in tqdm(df_sample.index, total=len(df_sample), desc="Building graph"):
    cuisines, diets, features, city_tag = profiles[idx]

    # Candidates: every restaurant sharing at least one tag, whatever its kind
    neighbors = set()
    for tag in tags_by_node[idx]:
        neighbors.update(index[tag])
    neighbors.discard(idx)

    for neighbor in neighbors:
        if (idx, neighbor) in added_edges or (neighbor, idx) in added_edges:
            continue  # pair already linked from the other side

        other_cuisines, other_diets, other_features, other_city = profiles[neighbor]

        # The overlap must exist within one kind of tag (or be the same city)
        linked = (
            bool(cuisines & other_cuisines)
            or bool(diets & other_diets)
            or bool(features & other_features)
            or city_tag == other_city
        )
        if linked:
            G.add_edge(idx, neighbor)
            added_edges.add((idx, neighbor))

#### 2: Weighted Network Creation

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in kilometres between two points.

    Latitudes and longitudes are given in decimal degrees. If a coordinate is NaN,
    the result is NaN.
    """
    R = 6371  # Earth radius in km
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    # Haversine formula: `a` is the haversine of the central angle between the points
    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points, which would
    # make sqrt(1 - a) raise ValueError. A NaN fails this comparison and is left untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def add_weighted_edges(G, df_sample):
    """Add weighted edges to `G` for the pairs of restaurants of `df_sample`.

    The weight is a dissimilarity: the lower, the more similar and the closer.
      - tag part (0 to 1): share of tags not common to both restaurants over cuisines,
        special_diets and features; 1 when neither restaurant has any tag
      - distance part (0 to 1): haversine distance / 5 km, set to 1 from 5 km on
      - a total of exactly 0 is stored as 0.0001
    Pairs reaching the maximum of 2 (nothing in common and 5 km or more apart, or
    not locatable) get no edge. Pairs already linked in `G` always keep an edge:
    their distance is set to 4.99 km, but only when it cannot be computed or is
    5 km or more, so a real shorter distance is kept.

    Each edge stores `weight` and `distance` (km). `G` is modified in place and
    returned.
    """
    tag_columns = ('cuisines', 'special_diets', 'features')

    # Pre-compute, once per restaurant, what every pair comparison needs
    restaurants = []
    for idx, row in df_sample.iterrows():
        tag_sets = [set(row[column]) for column in tag_columns]
        tag_counts = [len(row[column]) for column in tag_columns]
        restaurants.append((idx, tag_sets, tag_counts, row['latitude'], row['longitude']))

    for idx1, sets1, counts1, lat1, lon1 in tqdm(restaurants, total=len(restaurants), desc="Building graph"):
        for idx2, sets2, counts2, lat2, lon2 in restaurants:
            if idx1 >= idx2:
                continue  # each unordered pair is handled once

            # Tag part: common tags against the largest possible number of common tags
            similarity_score = sum(len(a & b) for a, b in zip(sets1, sets2))
            max_similarities = sum(max(a, b) for a, b in zip(counts1, counts2))

            if max_similarities == 0:
                weight = 1  # no tag on either side: no similarity can be measured
            else:
                weight = (max_similarities - similarity_score) / max_similarities

            # Distance part
            distance = haversine(lat1, lon1, lat2, lon2)
            distance_unknown = math.isnan(distance)

            if G.has_edge(idx1, idx2):
                # Already-linked pair: cap the distance just under 5 km so the edge survives,
                # without overwriting a real distance below 5 km
                if distance_unknown or distance >= 5:
                    distance = 4.99
            elif distance_unknown:
                distance = 5  # unlocated restaurant: treated as far away

            if distance < 5:
                weight = weight + (distance / 5)
            else:
                weight = weight + 1

            if weight == 0:
                weight = 0.0001  # to avoid problems with a weight of 0

            if weight < 2:
                # the distance is stored too: the recommendation step can use it
                G.add_edge(idx1, idx2, weight=weight, distance=distance)

    return G


G = add_weighted_edges(G, df_sample)

#### 3: Degree and Strength Analysis

In [None]:
# Degree: number of edges of each node
degree = dict(G.degree())

# Weighted degree: sum of the weights of the edges of each node
weighted_degree = dict(G.degree(weight='weight'))

# Print the 10 most connected restaurants based on the degree
print("Top 10 most connected restaurants based on the degree :")
top_degree = sorted(degree.items(), key=lambda x: x[1], reverse=True)[:10]
for node_id, deg in top_degree:
    # Single quotes inside the f-string: reusing the outer quote is a SyntaxError before Python 3.12
    print(f"{G.nodes[node_id]['name']}: {deg} ")

# Print the 10 most connected restaurants based on the weighted degree
print("\nTop 10 most connected restaurants based on the pondered degree :")
top_weighted_degree = sorted(weighted_degree.items(), key=lambda x: x[1], reverse=True)[:10]
for node_id, weighted_deg in top_weighted_degree:
    print(f"{G.nodes[node_id]['name']}: {weighted_deg}")

In [None]:
# This time the ranking is based on the ratio total weight / number of edges,
# i.e. the average weight of the edges of a node (lowest first).

# The ratio is only computed for nodes with more than n edges

n_egdes = 100

average_weight_ratio = {}
for node in G.nodes():
    deg = degree.get(node, 0)
    weighted_deg = weighted_degree.get(node, 0)
    if deg > n_egdes:
        average_weight_ratio[node] = weighted_deg / deg

print("\nTop 10 most connected restaurants based on the best ratio total_weight / nb_of_edge :")
lowest_ratio = sorted(average_weight_ratio.items(), key=lambda x: x[1])[:10]
# `node_id` rather than `id`, so the builtin id() is not shadowed in the notebook namespace
for node_id, ratio in lowest_ratio:
    deg = degree.get(node_id, 0)
    print(f"{node_id}, {G.nodes[node_id]['name']}: {ratio:.4f} ; {G.nodes[node_id]['city']} ; deg={deg}")

#### 4: Centrality Metrics

In [None]:
# Centrality measures
# Degree centrality of every node of G (no edge weight is passed here)
degree_centrality = nx.degree_centrality(G)
print("degree_centrality done")

In [None]:
# Weighted centralities: the edge attribute 'weight' is the dissimilarity built by
# add_weighted_edges (lower = more similar / closer).
# Closeness receives it as a distance; betweenness receives it as its weight.
closeness_centrality = nx.closeness_centrality(G, distance='weight')
print("closeness_centrality done")
betweenness_centrality = nx.betweenness_centrality(G, weight='weight')
print("betweenness_centrality done")
# NOTE(review): eigenvector centrality presumably reads 'weight' as a connection strength,
# which would favour dissimilar neighbours — confirm this is intended.
eigenvector_centrality = nx.eigenvector_centrality(G, weight='weight', max_iter=1000)
print("eigenvector_centrality done")

In [None]:
# Rank the entries of a score dictionary
def top_n(dic, n=10):
    """Return the `n` (key, value) pairs of `dic` with the highest values, best first."""
    ranked = list(dic.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Names and scores of the ten best-ranked restaurants for degree centrality
print("Top 10 restaurants by degree centrality:")
for node, score in top_n(degree_centrality):
    restaurant_name = G.nodes[node]['name']
    print(f"{restaurant_name}: {score:.4f}")

In [None]:
# Names and scores of the ten best-ranked restaurants for closeness centrality
print("\nTop 10 restaurants by closeness centrality:")
for node, score in top_n(closeness_centrality):
    restaurant_name = G.nodes[node]['name']
    print(f"{restaurant_name}: {score:.4f}")

In [None]:
# Names and scores of the ten best-ranked restaurants for betweenness centrality
print("\nTop 10 restaurants by betweenness centrality:")
for node, score in top_n(betweenness_centrality):
    restaurant_name = G.nodes[node]['name']
    print(f"{restaurant_name}: {score:.4f}")

In [None]:
# Names and scores of the ten best-ranked restaurants for eigenvector centrality
print("\nTop 10 restaurants by eigenvector centrality:")
for node, score in top_n(eigenvector_centrality):
    restaurant_name = G.nodes[node]['name']
    print(f"{restaurant_name}: {score:.4f}")

#### 5: Popularity vs Centrality Correlation

In [None]:
# Correlation with popularity_generic:

# popularity_generic is turned into a relative rank (rank / number of ranked places):
# values close to 0 are good ranks, values close to 1 bad ones, and -1 means no ranking.
# NOTE(review): assumes values look like "#<rank> of <total> ..." — confirm against the data.

ranking = []
for node in G.nodes():  # same node order as the centrality dictionaries
    raw_rank = G.nodes[node]["popularity_generic"]
    if str(raw_rank) == "nan":
        ranking.append(-1)
    else:
        raw_rank = raw_rank.split(" ")
        # Thousands separators, if any, are dropped so that int() cannot fail on them
        rank = int(raw_rank[0].replace("#", "").replace(",", ""))
        max_rank = int(raw_rank[2].replace(",", ""))
        ranking.append(rank / max_rank)

# Plotting popularity against each centrality measure, one panel per measure

centralities = [
    ("Degree", degree_centrality),
    ("Closeness", closeness_centrality),
    ("Betweenness", betweenness_centrality),
    ("Eigenvector", eigenvector_centrality),
]

fig, axes = plt.subplots(1, 4, figsize=(18, 5))

for ax, (label, centrality) in zip(axes, centralities):
    ax.scatter(pd.Series(centrality), ranking, alpha=0.5)
    ax.set_title(f"{label} vs Popularity")
    ax.set_xlabel(f"{label} Centrality")
    ax.set_ylabel("Popularity")
    ax.grid(False)

# Must be called: a bare `plt.show` only evaluates to the function object
plt.show()


In [None]:
# Correlation with total_reviews_count:

# Review counts listed in the same node order as the centrality dictionaries
ranking = [G.nodes[node]["total_reviews_count"] for node in G.nodes()]

# Plotting total_reviews_count against each centrality measure, one panel per measure

centralities = [
    ("Degree", degree_centrality),
    ("Closeness", closeness_centrality),
    ("Betweenness", betweenness_centrality),
    ("Eigenvector", eigenvector_centrality),
]

fig, axes = plt.subplots(1, 4, figsize=(18, 5))

for ax, (label, centrality) in zip(axes, centralities):
    ax.scatter(pd.Series(centrality), ranking, alpha=0.5)
    ax.set_title(f"{label} vs Total Review")
    ax.set_xlabel(f"{label} Centrality")
    ax.set_ylabel("total_reviews_count")
    ax.grid(False)

# Must be called: a bare `plt.show` only evaluates to the function object
plt.show()


In [None]:
# Correlation with avg_rating:

# Average ratings listed in the same node order as the centrality dictionaries
ranking = [G.nodes[node]["avg_rating"] for node in G.nodes()]

# Plotting avg_rating against each centrality measure, one panel per measure

centralities = [
    ("Degree", degree_centrality),
    ("Closeness", closeness_centrality),
    ("Betweenness", betweenness_centrality),
    ("Eigenvector", eigenvector_centrality),
]

fig, axes = plt.subplots(1, 4, figsize=(18, 5))

for ax, (label, centrality) in zip(axes, centralities):
    ax.scatter(pd.Series(centrality), ranking, alpha=0.5)
    ax.set_title(f"{label} vs Average Rating")
    ax.set_xlabel(f"{label} Centrality")
    ax.set_ylabel("Average Rating")
    ax.grid(False)

# Must be called: a bare `plt.show` only evaluates to the function object
plt.show()


#### 6: Community Detection (Unweighted)

In [None]:
# Apply Louvain on an unweighted copy of G: the 'weight' attribute is removed from
# every edge of the copy, G itself is left untouched
G_unweighted = G.copy()
for _, _, edge_data in G_unweighted.edges(data=True):
    edge_data.pop('weight', None)

# Seeded so that the detected communities are the same on every run
partition = community_louvain.best_partition(G_unweighted, random_state=random_state)
nx.set_node_attributes(G_unweighted, partition, "community")

# Print statistics (Counter comes from the import cell at the top of the notebook)
comm_counter = Counter(partition.values())
print(f"Number of detected communities: {len(comm_counter)}")
print(f"Length of biggest communities : {comm_counter.most_common(5)}")

In [None]:
# Unweighted graph, nodes coloured by Louvain community, edges drawn
community_ids = [G_unweighted.nodes[n]["community"] for n in G_unweighted.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(G_unweighted, k=1, seed=random_state)

plt.figure(figsize=(20, 20))
nodes = nx.draw_networkx_nodes(
    G_unweighted,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)
nx.draw_networkx_edges(G_unweighted, pos, alpha=0.05, width=0.2)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
# Unweighted graph, nodes coloured by Louvain community; edges are deliberately not drawn
community_ids = [G_unweighted.nodes[n]["community"] for n in G_unweighted.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(G_unweighted, k=1, seed=random_state)

plt.figure(figsize=(20, 20))
nodes = nx.draw_networkx_nodes(
    G_unweighted,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
def summarize_community_features(G_unweighted, community_id, top_n=3):
    """Print the size of a community and its `top_n` most frequent cuisines, diets and features.

    Nodes are selected through their "community" attribute; tags are read from the
    node attributes "cuisines", "special_diets" and "features" (missing ones count
    as empty).
    """
    members = [node for node, attrs in G_unweighted.nodes(data=True) if attrs["community"] == community_id]

    # One tally per kind of tag, fed with the tag lists of every member
    tallies = {"cuisines": Counter(), "special_diets": Counter(), "features": Counter()}
    for node in members:
        node_attrs = G_unweighted.nodes[node]
        for kind, tally in tallies.items():
            tally.update(node_attrs.get(kind, []))

    print(f"\nCommunity {community_id} - {len(members)} restaurants")
    print("Top cuisines:", tallies["cuisines"].most_common(top_n))
    print("Top diets:", tallies["special_diets"].most_common(top_n))
    print("Top features:", tallies["features"].most_common(top_n))

# Summarize the largest communities (despite its name, `top_3` holds up to five ids)
top_3 = [community for community, _size in comm_counter.most_common(5)]
for comm_id in top_3:
    summarize_community_features(G_unweighted, comm_id)


#### 7: Community Detection (Weighted)

In [None]:
# Apply Louvain on G (edge attribute 'weight' is present on the edges)
# NOTE(review): 'weight' is a dissimilarity here (lower = more similar), while Louvain
# presumably reads a larger weight as a stronger tie — confirm this is intended.

# Seeded so that the detected communities are the same on every run
partition = community_louvain.best_partition(G, random_state=random_state)
nx.set_node_attributes(G, partition, "community")

# Print statistics (Counter comes from the import cell at the top of the notebook)
comm_counter = Counter(partition.values())
print(f"Number of detected communities: {len(comm_counter)}")
print(f"Length of biggest communities : {comm_counter.most_common(5)}")

In [None]:
# Weighted graph, nodes coloured by Louvain community, edges drawn
community_ids = [G.nodes[n]["community"] for n in G.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(G, k=1, seed=random_state)

plt.figure(figsize=(20, 20))
nodes = nx.draw_networkx_nodes(
    G,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)
nx.draw_networkx_edges(G, pos, alpha=0.05, width=0.2)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
# Weighted graph, nodes coloured by Louvain community; edges are deliberately not drawn
community_ids = [G.nodes[n]["community"] for n in G.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(G, k=1, seed=random_state)

plt.figure(figsize=(20, 20))
nodes = nx.draw_networkx_nodes(
    G,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
def summarize_community_features(G, community_id, top_n=3):
    """Print the size of a community and its `top_n` most frequent cuisines, diets and features.

    Nodes are selected through their "community" attribute; tags are read from the
    node attributes "cuisines", "special_diets" and "features" (missing ones count
    as empty).
    """
    members = [node for node, attrs in G.nodes(data=True) if attrs["community"] == community_id]

    # One tally per kind of tag, fed with the tag lists of every member
    tallies = {"cuisines": Counter(), "special_diets": Counter(), "features": Counter()}
    for node in members:
        node_attrs = G.nodes[node]
        for kind, tally in tallies.items():
            tally.update(node_attrs.get(kind, []))

    print(f"\nCommunity {community_id} - {len(members)} restaurants")
    print("Top cuisines:", tallies["cuisines"].most_common(top_n))
    print("Top diets:", tallies["special_diets"].most_common(top_n))
    print("Top features:", tallies["features"].most_common(top_n))

# Summarize the (up to) five largest communities found on G
top = [community for community, _size in comm_counter.most_common(5)]
for comm_id in top:
    summarize_community_features(G, comm_id)


#### 8 : Role of Dietary Preferences

In [None]:
# Splitting the nodes in two groups: with / without a dietary tag

diet_tags = {"Vegetarian Friendly", "Vegan Options", "Gluten Free Options"}

# A "diet node" has at least one of the diet tags among its special_diets
diet_nodes = [node for node in G.nodes if not diet_tags.isdisjoint(G.nodes[node]['special_diets'])]

non_diet_nodes = list(set(G.nodes()) - set(diet_nodes))

# Degree centrality of each group

diet_degrees = [degree_centrality[node] for node in diet_nodes]
non_diet_degrees = [degree_centrality[node] for node in non_diet_nodes]

# Community id of each group (nodes without a "community" attribute are ignored)

diet_comms = [G.nodes[node]['community'] for node in diet_nodes if 'community' in G.nodes[node]]
non_diet_comms = [G.nodes[node]['community'] for node in non_diet_nodes if 'community' in G.nodes[node]]


In [None]:
# Mean and median of the degree centrality and of the community id, per group
# NOTE(review): community ids look like plain labels, so their mean/median may not be
# meaningful — confirm what "community inclusion" is meant to measure.

avg_degree_diet_degrees, med_degree_diet_degrees = np.mean(diet_degrees), np.median(diet_degrees)
avg_degree_non_diet_degrees, med_degree_non_diet_degrees = np.mean(non_diet_degrees), np.median(non_diet_degrees)

avg_comms_diet_comms, med_comms_diet_comms = np.mean(diet_comms), np.median(diet_comms)
avg_comms_non_diet_comms, med_comms_non_diet_comms = np.mean(non_diet_comms), np.median(non_diet_comms)

# Degree centrality
print(f"Average degree centrality for diet nodes: {avg_degree_diet_degrees}")
print(f"Average degree centrality for non diet nodes: {avg_degree_non_diet_degrees}")
print(f"Median degree centrality for diet nodes: {med_degree_diet_degrees}")
print(f"Mediam degree centrality for non diet nodes: {med_degree_non_diet_degrees}\n")

# Community ids
print(f"Average community inclusion for diet nodes: {avg_comms_diet_comms}")
print(f"Average community inclusion for non diet nodes: {avg_comms_non_diet_comms}")
print(f"Median community inclusion for diet nodes: {med_comms_diet_comms}")
print(f"Mediam community inclusion for non diet nodes: {med_comms_non_diet_comms}\n")

# Group sizes
print(f"Number of diet nodes: {len(diet_nodes)}")
print(f"Number of non diet nodes: {len(non_diet_nodes)}")

In [None]:
# Density of the subgraph induced by each group of nodes
diet_subgraph = G.subgraph(diet_nodes)
diet_density = nx.density(diet_subgraph)

non_diet_subgraph = G.subgraph(non_diet_nodes)
non_diet_density = nx.density(non_diet_subgraph)

# Printing
print(f"Density of diet subgraph: {diet_density:.4f}")
print(f"Density of non-diet subgraph: {non_diet_density:.4f}")


#### 9 : Subnetwork Analysis by Region

In [None]:
# City tag (last element of original_location, all spaces removed) and country of every
# sampled restaurant, in the row order of df_sample

cities = [
    element.replace("[", "").replace("]", "").replace('"', "").replace(" ", "").split(",")[-1]
    for element in np.array(df_sample['original_location'])
]

countries = list(df_sample['country'])

# Most represented cities and countries

n_most_common = 10 # increase for a larger analysis

city_counts = Counter(cities)
country_counts = Counter(countries)

top_cities = city_counts.most_common(n_most_common)
top_countries = country_counts.most_common(n_most_common)

In [None]:
# (name, number of restaurants) pairs for the most represented cities, then countries
print(top_cities)
print(top_countries)

In [None]:
# Creating subgraphs for top cities (cities with the most restaurants)

city_stats = []

for city, _ in top_cities:
    # Exact comparison with the parsed city tags of `cities` (same row order as df_sample).
    # A substring search in the raw location misses multi-word cities, whose tag has its
    # spaces removed, and also matches a region or country containing the city name.
    city_nodes = [i for i, city_tag in enumerate(cities) if city_tag == city]
    subG = G.subgraph(city_nodes)

    if len(subG) < 2:
        continue

    # The diameter is only defined on a connected graph: fall back to the largest component
    if nx.is_connected(subG):
        diameter = nx.diameter(subG)
    else:
        largest_cc = max(nx.connected_components(subG), key=len)
        diameter = nx.diameter(subG.subgraph(largest_cc))

    stats = {
        "city": city,
        "n_restaurants": len(subG),
        "density": nx.density(subG),
        "diameter": diameter,
        "clustering": nx.average_clustering(subG),
    }
    city_stats.append(stats)
stats_df = pd.DataFrame(city_stats)
display(stats_df)

In [None]:
# Creating subgraphs for top countries (countries with the most restaurants)

country_stats = []

for country, _ in top_countries:
    # Exact comparison: a substring test would put "Northern Ireland" in "Ireland" and
    # fails on a missing (NaN) country
    country_nodes = [i for i, loc in enumerate(df_sample['country']) if loc == country]
    subG = G.subgraph(country_nodes)

    if len(subG) < 2:
        continue

    # The diameter is only defined on a connected graph: fall back to the largest component
    if nx.is_connected(subG):
        diameter = nx.diameter(subG)
    else:
        largest_cc = max(nx.connected_components(subG), key=len)
        diameter = nx.diameter(subG.subgraph(largest_cc))

    stats = {
        "country": country,
        "n_restaurants": len(subG),
        "density": nx.density(subG),
        "diameter": diameter,
        "clustering": nx.average_clustering(subG),
    }
    country_stats.append(stats)

stats_df = pd.DataFrame(country_stats)
display(stats_df)

In [None]:
# Same indicators for the whole graph, to compare with the city and country subgraphs
print(f"n_restaurants: {len(G)} ; density {nx.density(G)} ; clustering {nx.average_clustering(G)}")

#### 10 : Visualization of the Global Network

In [None]:
# Global network: colour = community, node size = number of reviews, edge width = weight
node_colors = [G.nodes[n]["community"] for n in G.nodes()]
node_sizes = [df_sample.loc[n, "total_reviews_count"] if n in df_sample.index else 1 for n in G.nodes]
# Edge weights are below 2, so the most similar pairs get the widest lines
edge_widths  = [(2 - G[u][v].get("weight", 1)) / 5 for u, v in G.edges]

# Normalizing sizes between 100 and 600; a missing review count is treated as 0 so that
# a single NaN does not turn every size into NaN
sizes = np.nan_to_num(np.array(node_sizes, dtype=float), nan=0.0)
sizes = 100 + 500 * (sizes - sizes.min()) / (sizes.max() - sizes.min() + 1e-5)
pos = nx.spring_layout(G, seed=42, k=1.5)
# Number of colours taken from the communities of G, not from a `num_comms` left by another cell
num_comms = max(node_colors) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)

plt.figure(figsize=(100, 100))
nodes = nx.draw_networkx_nodes(G, pos,
                               node_color=node_colors,
                               node_size=sizes,  # the normalized sizes, not the raw counts
                               cmap=cmap,
                               alpha=0.8)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.4)

plt.title("Global Network (color = community / size = reviews / edge width = weight)", fontsize=14)
plt.axis("off")
plt.colorbar(nodes, label="Community ID")
plt.show()

In [None]:
# Global network: colour = community, node size = number of reviews; edges are not drawn
node_colors = [G.nodes[n]["community"] for n in G.nodes()]
node_sizes = [df_sample.loc[n, "total_reviews_count"] if n in df_sample.index else 1 for n in G.nodes]

# Normalizing sizes between 100 and 600; a missing review count is treated as 0 so that
# a single NaN does not turn every size into NaN
sizes = np.nan_to_num(np.array(node_sizes, dtype=float), nan=0.0)
sizes = 100 + 500 * (sizes - sizes.min()) / (sizes.max() - sizes.min() + 1e-5)
pos = nx.spring_layout(G, seed=42, k=1.5)
# Number of colours taken from the communities of G, not from a `num_comms` left by another cell
num_comms = max(node_colors) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)

plt.figure(figsize=(100, 100))
nodes = nx.draw_networkx_nodes(G, pos,
                               node_color=node_colors,
                               node_size=sizes,  # the normalized sizes, not the raw counts
                               cmap=cmap,
                               alpha=0.8)

plt.title("Global Network  (color = community / size = reviews / without edges)", fontsize=14)
plt.axis("off")
plt.colorbar(nodes, label="Community ID")
plt.show()

#### 11 : Keyword Similarity Network

In [None]:
# For this task, as the "keywords" column is empty for a lot of restaurants, we work on a
# subset made only of restaurants with a non-empty "keywords" column.
if same_dataset:
    # df_sample was already drawn among restaurants with keywords: reuse it, so that
    # df_keyword_subset is defined in both configurations
    df_keyword_subset = df_sample.copy()
else:
    df_keywords = df[df['keywords'].apply(lambda x: isinstance(x, list) and len(x) > 0)].copy().reset_index(drop=True)
    df_keyword_subset = df_keywords.sample(n=number_of_nodes, random_state=random_state).copy().reset_index(drop=True)

In [None]:
# One text document per restaurant (its keywords joined by spaces), then a
# document-term count matrix built from these documents

df_keyword_subset['keywords_str'] = df_keyword_subset['keywords'].map(" ".join)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_keyword_subset['keywords_str'])

In [None]:
# Cosine-similarity matrix between the keyword count vectors of the restaurants
# (entry [i, j] compares rows i and j of X)
cos_sim_matrix = cosine_similarity(X)

In [None]:
# Define the threshold

similarity_threshold = 0.5

# Graph construction: one node per restaurant of the keyword subset
G_keywords = nx.Graph()
G_keywords.add_nodes_from(df_keyword_subset.index)

# Adding edges: every pair (i < j) whose similarity reaches the threshold, taken from
# the strict upper triangle of the matrix in row order
above_threshold = np.triu(cos_sim_matrix >= similarity_threshold, k=1)
for i, j in zip(*np.nonzero(above_threshold)):
    i, j = int(i), int(j)  # plain ints, like the node ids
    G_keywords.add_edge(i, j, weight=cos_sim_matrix[i, j])



In [None]:

# Basic indicators of the keyword-similarity graph
keyword_graph_report = [
    f"Number of nodes : {G_keywords.number_of_nodes()}",
    f"Number of edges : {G_keywords.number_of_edges()}",
    f"Density : {nx.density(G_keywords):.4f}",
    f"Average clustering coefficient : {nx.average_clustering(G_keywords):.4f}",
]
for report_line in keyword_graph_report:
    print(report_line)

In [None]:
# Louvain-communities detection on the keyword graph (edge weight = cosine similarity)
# Seeded so that the detected communities are the same on every run
partition = community_louvain.best_partition(G_keywords, weight='weight', random_state=random_state)
nx.set_node_attributes(G_keywords, partition, 'community')

# Communities Analysis
comm_counter = Counter(partition.values())
print(f"Number of communities (desc similarity): {len(comm_counter)}")
print(f"Top 5 largest communities: {comm_counter.most_common(5)}")

In [None]:
# Keyword graph, nodes coloured by Louvain community, edges drawn
community_ids = [G_keywords.nodes[n]["community"] for n in G_keywords.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(G_keywords, k = 0.5, seed=random_state)

plt.figure(figsize=(24, 20))
nodes = nx.draw_networkx_nodes(
    G_keywords,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)

nx.draw_networkx_edges(G_keywords, pos, alpha=0.1, width=0.5)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
# Number of keywords printed per community, and number of communities to print

nb_keywords = 5
nb_comm = 9

# Summary function

def summarize_community_keywords(G, community_id, top_n=nb_keywords):
    """Print the size of community `community_id` of `G` and its `top_n` most frequent keywords.

    Keywords are read from the "keywords" node attribute; nodes without it, or with
    an empty value, are only counted in the community size.
    """
    members = [node for node, attrs in G.nodes(data=True) if attrs["community"] == community_id]

    keyword_counts = Counter()
    for node in members:
        node_keywords = G.nodes[node].get("keywords")
        if node_keywords:
            keyword_counts.update(node_keywords)

    print(f"\nCommunity {community_id} - {len(members)} restaurants")
    print("Top keywords:", keyword_counts.most_common(top_n))

# Storing on every node the keyword list of the matching row of df_keyword_subset

keywords_by_node = {idx: df_keyword_subset["keywords"].iloc[idx] for idx in G_keywords.nodes}
nx.set_node_attributes(G_keywords, keywords_by_node, "keywords")

# Summaries of the `nb_comm` largest communities
top = [community for community, _size in comm_counter.most_common(nb_comm)]
for comm_id in top:
    summarize_community_keywords(G_keywords, comm_id)

#### 12 : Backbone Extraction

In [None]:
def disparity_filtering(G, alpha_threshold=0.5):
    """Return a new graph keeping only the edges of `G` that pass a disparity-style test.

    For a node with k > 1 neighbours, each edge gets p_ij = weight / (sum of the node's
    edge weights) and alpha = 1 - (1 - p_ij) ** (k - 1); the edge is kept, with its
    weight, when alpha < alpha_threshold for at least one of its two endpoints.
    Edges without a 'weight' count as 1.0. Nodes left without any kept edge are not
    part of the result.

    NOTE(review): the usual disparity filter keeps an edge when (1 - p_ij) ** (k - 1)
    is below the threshold; here the complement is tested, which favours edges with a
    small share of the node's total weight — confirm this is intended.
    """
    backbone = nx.Graph()

    for node in G.nodes():
        incident = [(nbr, data.get('weight', 1.0)) for nbr, data in G[node].items()]
        k = len(incident)
        if k <= 1:
            continue  # the test needs at least two edges

        # Total weight of the edges of the node
        strength = 0
        for _, w in incident:
            strength += w

        for nbr, w in incident:
            p_ij = w / strength
            alpha = 1 - (1 - p_ij) ** (k - 1)
            if alpha < alpha_threshold:
                backbone.add_edge(node, nbr, weight=w)

    return backbone



# Application of the filter on G (weighted)

backbone_graph = disparity_filtering(G, alpha_threshold=0.5)


In [None]:
# Basic indicators of the backbone graph
backbone_report = [
    f"Backbone size: {backbone_graph.number_of_nodes()} nodes, {backbone_graph.number_of_edges()} edges",
    f"Density: {nx.density(backbone_graph):.4f}",
    f"Average clustering coefficient: {nx.average_clustering(backbone_graph, weight='weight'):.4f}",
]
for report_line in backbone_report:
    print(report_line)

In [None]:
# Louvain-communities detection on the backbone graph
# NOTE(review): backbone weights come from G, where 'weight' is a dissimilarity, while
# Louvain presumably reads a larger weight as a stronger tie — confirm this is intended.
# Seeded so that the detected communities are the same on every run
partition = community_louvain.best_partition(backbone_graph, weight='weight', random_state=random_state)
nx.set_node_attributes(backbone_graph, partition, 'community')

# Communities Analysis
comm_counter = Counter(partition.values())
print(f"Number of communities (desc similarity): {len(comm_counter)}")
print(f"Top 5 largest communities: {comm_counter.most_common(5)}")

In [None]:
# Backbone graph, nodes coloured by Louvain community; edges are deliberately not drawn
community_ids = [backbone_graph.nodes[n]["community"] for n in backbone_graph.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Seeded layout: the figure is reproducible from one run to the next
pos = nx.spring_layout(backbone_graph, k = 0.5, seed=random_state)

plt.figure(figsize=(24, 20))
nodes = nx.draw_networkx_nodes(
    backbone_graph,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

#### 13: Recommendation via Network Proximity

In [None]:
# Choose the restaurant for which recommendations are computed

restaurant_id = 60 # node id of the restaurant, or "random" to draw one at random
nb_recommandations = 10

if restaurant_id == "random":
    target_node = random.choice(list(G.nodes))
else:
    target_node = restaurant_id

# Single quotes inside the f-string: reusing the outer double quotes is a SyntaxError before Python 3.12
print(f"Choosen restaurant is {target_node}, {G.nodes[target_node]['name']} in {G.nodes[target_node]['city']}")

In [None]:
# Extract the path weight from the target restaurant to every other reachable restaurant.
# A single single-source Dijkstra run yields all weighted distances at once, instead of
# launching one shortest-path search per node (so no progress bar is needed any more).
distances = nx.single_source_dijkstra_path_length(G, target_node, weight='weight')

path_weight = {}
for node in G.nodes:
    if node == target_node:
        continue
    if G.has_edge(target_node, node):
        # direct neighbours are scored by the direct edge weight, even if a lighter multi-hop path exists
        path_weight[node] = G[target_node][node]['weight']
    elif node in distances:
        path_weight[node] = distances[node]
    # nodes with no path to target_node are left out

In [None]:
# Keep the nb_recommandations restaurants with the smallest path weight
top_k = min(nb_recommandations, len(path_weight))
ranked_candidates = sorted(path_weight.items(), key=lambda pair: pair[1])
top_recommendations = ranked_candidates[:top_k]
recommended_nodes = [pair[0] for pair in top_recommendations]

# Comparing shared cuisines
def safe_set(row):
    """Return the distinct items of ``row`` as a set; any value that is not a list gives an empty set."""
    if not isinstance(row, list):
        return set()
    return set(row)

# Attribute sets of the target restaurant
target_cuisines = safe_set(df_sample.loc[target_node, 'cuisines'])
target_features = safe_set(df_sample.loc[target_node, 'features'])
target_special_diets = safe_set(df_sample.loc[target_node, 'special_diets'])

# One ratio per recommended restaurant, in the order of recommended_nodes
shared_cuisines = []
shared_features = []
shared_special_diets = []

for node in recommended_nodes:
    node_cuisines = safe_set(df_sample.loc[node, 'cuisines'])
    node_features = safe_set(df_sample.loc[node, 'features'])
    node_special_diets = safe_set(df_sample.loc[node, 'special_diets'])

    # items the recommended restaurant has in common with the target
    cuisines = target_cuisines & node_cuisines
    features = target_features & node_features
    special_diets = target_special_diets & node_special_diets

    # share of the target's items that are also found here; 0 when the target has none
    ratio_cuisines = len(cuisines) / len(target_cuisines) if target_cuisines else 0
    ratio_features = len(features) / len(target_features) if target_features else 0
    ratio_special_diets = len(special_diets) / len(target_special_diets) if target_special_diets else 0

    shared_cuisines.append(ratio_cuisines)
    shared_features.append(ratio_features)
    shared_special_diets.append(ratio_special_diets)

def get_score(top_recommendations):
    """Convert total path weights into recommendation scores.

    Parameters
    ----------
    top_recommendations : list of (node, total_weight)
        Recommendations with the total weight of their path.

    Returns
    -------
    numpy.ndarray
        One float per recommendation, in the same order: ``10 - total_weight``,
        replaced by 0 when the total weight is above 10.
    """
    total_weights = np.array([rec[1] for rec in top_recommendations], dtype=float)
    # flooring at 0 is the same as zeroing every weight above 10
    return np.maximum(10 - total_weights, 0)

scores = get_score(top_recommendations)

# Print results

print(f"Recommandation for restaurant {target_node}, {G.nodes[target_node]['name']} in {G.nodes[target_node]['city']} :")

if top_k == 0:
    print("No recommendations for this restaurant.")
else:
    if top_k != nb_recommandations:
        print(f"Only {len(path_weight)} restaurants recommanded")
    print("Recommendation scores:")
    # Walk the parallel lists together instead of pop(0)-ing them, so printing
    # does not empty the ratio lists and this part can be re-run on its own.
    report_rows = zip(top_recommendations, scores, shared_cuisines, shared_features, shared_special_diets)
    for (node, _), score, cuisine_ratio, feature_ratio, diet_ratio in report_rows:
        print(
            f"Restaurant {node}, {G.nodes[node]['name']} in {G.nodes[node]['city']}: "
            f"{Fore.RED}Score={score:.4f}{Style.RESET_ALL}, "
            f"{Fore.YELLOW}Shared cuisine ratio={cuisine_ratio:.2f}{Style.RESET_ALL}, "
            f"{Fore.GREEN}Shared features ratio={feature_ratio:.2f}{Style.RESET_ALL}, "
            f"{Fore.CYAN}Shared special diets ratio={diet_ratio:.2f}{Style.RESET_ALL}"
        )


#### 14: Semantic Similarity Analysis Using Keywords

In [None]:
# TF-IDF vectors of the keyword strings and their pairwise cosine similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_keyword_subset['keywords_str'])
cos_sim_matrix = cosine_similarity(tfidf_matrix)

# Network construction: an edge links two restaurants whose similarity exceeds the threshold
similarity_threshold = 0.4
G_semantic = nx.Graph()
n_restaurants = len(df_keyword_subset)

# Nodes are the positional row numbers of df_keyword_subset, named after the restaurant link
for idx, link in enumerate(df_keyword_subset['restaurant_link']):
    G_semantic.add_node(idx, name=link)

# Edges: only the upper triangle (j > i) is visited, so each pair is tested once
for i, similarities in enumerate(cos_sim_matrix):
    for j in range(i + 1, n_restaurants):
        sim = similarities[j]
        if sim > similarity_threshold:
            G_semantic.add_edge(i, j, weight=sim)

In [None]:
# Summary statistics of the semantic network (each value is computed right before it is printed)
semantic_nodes = G_semantic.number_of_nodes()
print(f"Number of nodes : {semantic_nodes}")

semantic_edges = G_semantic.number_of_edges()
print(f"Number of edges : {semantic_edges}")

semantic_density = nx.density(G_semantic)
print(f"Density : {semantic_density:.4f}")

# unweighted clustering coefficient
semantic_clustering = nx.average_clustering(G_semantic)
print(f"Average clustering coefficient : {semantic_clustering:.4f}")

In [None]:
# Community detection (Louvain) on the semantic network.
# Louvain is randomised; seeding it with the notebook-level random_state keeps the
# community ids (and the keyword summaries built on them) identical between runs.

partition = community_louvain.best_partition(G_semantic, random_state=random_state)
nx.set_node_attributes(G_semantic, partition, "community")

# community id -> number of restaurants
comm_counter = Counter(partition.values())
print(f"Number of communities (desc similarity): {len(comm_counter)}")
print(f"Top 5 largest communities: {comm_counter.most_common(5)}")

In [None]:
# Visualisation of the semantic network, one colour per Louvain community

community_ids = [G_semantic.nodes[n]["community"] for n in G_semantic.nodes()]
num_comms = max(community_ids) + 1
# plt.get_cmap is the supported spelling: plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9
cmap = plt.get_cmap('nipy_spectral', num_comms)
# Fixed seed so the layout is the same on every run (same seed as the role plots)
pos = nx.spring_layout(G_semantic, seed=42)

plt.figure(figsize=(24, 20))
nodes = nx.draw_networkx_nodes(
    G_semantic,
    pos,
    node_color=community_ids,
    cmap=cmap,
    node_size=30
)

nx.draw_networkx_edges(G_semantic, pos, alpha=0.1, width=1)

plt.title("Communities colored by Louvain ID")
plt.axis("off")

# One colorbar tick per community id
cbar = plt.colorbar(nodes, ticks=range(num_comms))
cbar.set_label("Community ID")

plt.show()

In [None]:
# Number of keywords shown per community, and number of (largest) communities to summarise

nb_keywords = 5
nb_comm = 5

# Define the function

def summarize_community_keywords(G, community_id, top_n=nb_keywords):
    """Print the size and the most frequent keywords of one community of ``G``.

    Parameters
    ----------
    G : graph
        Graph whose nodes all carry a ``"community"`` attribute and, optionally,
        a ``"keywords"`` attribute holding an iterable of keywords.
    community_id
        Value of the ``"community"`` attribute selecting the nodes to summarise.
    top_n : int
        Number of keywords to print. The default is the value ``nb_keywords``
        had when this function was defined, not its value at call time.
    """
    members = [node for node, attrs in G.nodes(data=True) if attrs["community"] == community_id]

    # flatten the keywords of every member; nodes with no or empty keywords contribute nothing
    keywords = [
        keyword
        for node in members
        for keyword in (G.nodes[node].get("keywords") or [])
    ]

    print(f"\nCommunity {community_id} - {len(members)} restaurants")
    print("Top keywords:", Counter(keywords).most_common(top_n))

# Attach to every node the keyword list found at the same position in df_keyword_subset

keywords_by_node = {idx: df_keyword_subset.iloc[idx]["keywords"] for idx in G_semantic.nodes}
nx.set_node_attributes(G_semantic, keywords_by_node, "keywords")

# Summarise the nb_comm largest communities, biggest first
top = [comm for comm, _ in comm_counter.most_common(nb_comm)]
for comm_id in top:
    summarize_community_keywords(G_semantic, comm_id)

#### 15: Multi-Attribute Role Classification

In [None]:
# Define thresholds
# An average rating at or above this value counts as "high" in classify_role
high_rating_thresh = 4.5
# 90th percentile of the review counts, computed on the full dataframe df (not on df_sample)
high_reviews_thresh = df['total_reviews_count'].quantile(0.9)

In [None]:
# Function to classify role
def _is_affirmative(value):
    """Interpret a dietary flag as a boolean.

    Strings are affirmative only when they spell yes ("Y", "yes", "true", "1",
    case-insensitive); missing values (None / NaN) are negative; anything else
    falls back to its truthiness, so real booleans keep their meaning.
    """
    if isinstance(value, str):
        return value.strip().lower() in {"y", "yes", "true", "1"}
    if value is None or (pd.api.types.is_scalar(value) and pd.isnull(value)):
        return False
    return bool(value)


def classify_role(row):
    """Assign a role to one restaurant.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'avg_rating'``, ``'awards'`` and ``'total_reviews_count'``;
        ``'vegetarian_friendly'``, ``'vegan_options'`` and ``'gluten_free'`` are
        read with ``row.get`` and default to False.

    Returns
    -------
    str
        ``'premium'``     high rating, at least one award and many reviews;
        ``'specialist'``  all three dietary flags set, no award and no high rating;
        ``'popular hub'`` many reviews;
        ``'standard'``    everything else.
        "High" and "many" are defined by the module-level ``high_rating_thresh``
        and ``high_reviews_thresh``.
    """
    rating = row['avg_rating']
    awards = bool(row['awards']) if not pd.isnull(row['awards']) else False
    reviews = row['total_reviews_count']
    # NOTE(review): the raw CSV appears to store these flags as 'Y'/'N' strings - confirm.
    # Plain truthiness would treat 'N' (a non-empty string) and NaN as True, so
    # every flag is normalised before being combined.
    dietary = all(
        _is_affirmative(row.get(flag, False))
        for flag in ('vegetarian_friendly', 'vegan_options', 'gluten_free')
    )

    if rating >= high_rating_thresh and awards and reviews >= high_reviews_thresh:
        return 'premium'
    elif dietary and not (awards or rating >= high_rating_thresh):
        return 'specialist'
    elif reviews >= high_reviews_thresh:
        return 'popular hub'
    else:
        return 'standard'


In [None]:
# Label every sampled restaurant with its role (one classify_role call per row)
df_sample['role'] = df_sample.apply(classify_role, axis=1)

# Copy the roles onto the nodes of G, matched through the df_sample index
role_by_node = df_sample['role'].to_dict()
nx.set_node_attributes(G, role_by_node, 'role')

In [None]:
# Visualization with color mapping by role and node size given by the review count
role_colors = {
    'premium': 'gold',
    'specialist': 'green',
    'popular hub': 'blue',
    'standard': 'gray'
}

colors = [role_colors.get(G.nodes[n].get('role', 'standard')) for n in G.nodes]
# nodes that are not in df_sample get the minimal size 1
node_sizes = [df_sample.loc[n, "total_reviews_count"] if n in df_sample.index else 1 for n in G.nodes]
# only needed by the optional edge drawing below
edge_widths  = [(2 - G[u][v].get("weight", 1)) / 5 for u, v in G.edges]
pos = nx.spring_layout(G, seed=42, k=1)

plt.figure(figsize=(100, 100))
# node_color already holds named colours, so no colormap is passed: the previous
# cmap=cmap had no effect on them and silently depended on a variable left over
# from the community plots.
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=node_sizes, alpha=0.8)
#nx.draw_networkx_edges(G, pos, width=edge_widths , alpha=0.4) # edges are not drawn here, for readability
patches = [mpatches.Patch(color=color, label=role) for role, color in role_colors.items()]
plt.legend(handles=patches, loc='upper right')
plt.title("Network Visualization by Restaurant Role with (color = role / size = reviews / edge width = weight)")
plt.axis('off')
plt.show()

In [None]:
# Visualization with color mapping by role, uniform node size, edges drawn
role_colors = {
    'premium': 'gold',
    'specialist': 'green',
    'popular hub': 'blue',
    'standard': 'gray'
}

colors = [role_colors.get(G.nodes[n].get('role', 'standard')) for n in G.nodes]
pos = nx.spring_layout(G, seed=42, k=0.8)

plt.figure(figsize=(20, 20))
# node_color already holds named colours, so no colormap is passed: the previous
# cmap=cmap had no effect on them and silently depended on a variable left over
# from the community plots.
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=30, alpha=0.8)
# thin, transparent edges so the nodes stay visible
nx.draw_networkx_edges(G, pos, width=0.1, alpha=0.4)
patches = [mpatches.Patch(color=color, label=role) for role, color in role_colors.items()]
plt.legend(handles=patches, loc='upper right')
plt.title("Network Visualization by Restaurant Role with color = role")
plt.axis('off')
plt.show()

In [None]:
# Centrality analysis per role: degree centrality of every node, grouped by role
centrality = nx.degree_centrality(G)
role_centrality = {role: [] for role in role_colors}
for node, cent in centrality.items():
    role = G.nodes[node].get('role')
    # nodes without a role (or with an unknown one) are ignored
    if role in role_centrality:
        role_centrality[role].append(cent)

# Print average centrality by role
print("\nAverage centrality by role:")
for role, values in role_centrality.items():
    # a role with no node prints nan, without numpy's "Mean of empty slice" RuntimeWarning
    avg_centrality = np.mean(values) if values else float('nan')
    print(f"{role}: {avg_centrality:.4f}")

In [None]:
# Degree analysis per role: degree of every node, grouped by role
degree = nx.degree(G)
role_degree = {role: [] for role in role_colors}
for node, node_degree in degree:
    role = G.nodes[node].get('role')
    # nodes without a role (or with an unknown one) are ignored
    if role in role_degree:
        role_degree[role].append(node_degree)

# Print average Degree by role
print("\nAverage degree by role:")
for role, values in role_degree.items():
    # a role with no node prints nan, without numpy's "Mean of empty slice" RuntimeWarning
    avg_degree = np.mean(values) if values else float('nan')
    print(f"{role}: {avg_degree:.4f}")

In [None]:
# Number of sampled restaurants per role, most frequent role first
role_counts = df_sample['role'].value_counts()

print("Number of restaurants by role :", role_counts, sep="\n")


In [None]:
# Collect, for every role, the community id of each node of G having that role
role_communities = {role: [] for role in role_colors}
for node, attrs in G.nodes(data=True):
    role = attrs.get('role')
    community = attrs.get('community')
    # skip nodes with an unknown role or without a community
    if community is None or role not in role_communities:
        continue
    role_communities[role].append(community)

# NOTE(review): this averages community ids; if they are plain Louvain labels the
# mean has no ordinal meaning - confirm this is the intended metric.
print("\nAverage community inclusion by role:")
for role, communities in role_communities.items():
    if communities:
        avg_comm = np.mean(communities)
    else:
        avg_comm = 0
    print(f"{role}: {avg_comm:.2f}")