# Clustering the coworking spaces.

## Merge the coworking-space records from each city's JSON file

In [1]:
import pandas as pd
import json
import os

# Per-city scrape results to combine (point these at the files in your own checkout)
json_files = [
    "/workspaces/Coworking/src/results/Madrid/Madrid_coworking_spaces.json",
    "/workspaces/Coworking/src/results/Barcelona/Barcelona_coworking_spaces.json",
    "/workspaces/Coworking/src/results/New York/New_York_coworking_spaces.json",
    "/workspaces/Coworking/src/results/Tokyo/tokyo_coworking_spaces.json",
    "/workspaces/Coworking/src/results/Sao Paulo/sp_coworking_spaces.json"
]


def _read_city_frame(path):
    """Parse one city's JSON file and return its contents as a DataFrame."""
    with open(path, 'r', encoding='utf-8') as handle:
        return pd.DataFrame(json.load(handle))


# One DataFrame per city, in the same order as json_files
dfs = [_read_city_frame(path) for path in json_files]

# Stack the per-city frames into a single table with a fresh 0..n-1 index
merged_df = pd.concat(dfs, ignore_index=True)

# Quick look at the first few merged rows
print(merged_df.head())

# Persist the merged table: CSV first, then JSON with one record per line
merged_df.to_csv("merged_coworking_spaces.csv", index=False)
merged_df.to_json("merged_coworking_spaces.json", orient="records", lines=True, force_ascii=False)

print("All JSON files have been merged successfully!")


                                                 url  \
0  https://www.coworker.com/spain/madrid/regus-ma...   
1  https://www.coworker.com/spain/madrid/regus-la...   
2  https://www.coworker.com/spain/madrid/regus-ma...   
3  https://www.coworker.com/spain/madrid/wework-e...   
4  https://www.coworker.com/spain/madrid/regus-ma...   

                                                name address  \
0  Coworking Space: Regus - Madrid Financial Dist...           
1  Coworking Space: Regus - LAS ROZAS, Las Rozas ...           
2  Coworking Space: Regus - Madrid, Ortega y Gass...           
3  Coworking Space: WeWork Eloy Gonzalo 27 in Madrid           
4  Coworking Space: Regus - Madrid Pinar-Salamanc...           

                                         description amenities  \
0  Overview of Regus - Madrid Financial District ...        []   
1  Overview of Regus - LAS ROZAS, Las Rozas\nThe ...        []   
2  Overview of Regus - Madrid, Ortega y Gasset\nT...        []   
3  Overview of

## Cluster the coworking spaces with K-Means

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Number of K-Means clusters; also drives the per-cluster summary loop at the end.
N_CLUSTERS = 5

# Drop rows where price is missing or invalid.
# Coercing to numeric turns unparseable values (e.g. empty strings) into NaN so
# dropna() removes them too. The trailing .copy() yields an independent frame,
# so adding the 'cluster' column below cannot raise SettingWithCopyWarning.
# NOTE(review): assumes 'price' holds plain numbers or numeric strings — values
# carrying currency symbols or units would be dropped here; confirm against the data.
merged_df = (
    merged_df
    .assign(price=pd.to_numeric(merged_df['price'], errors='coerce'))
    .dropna(subset=['price'])
    .copy()
)

# Fail early with a clear message instead of a less obvious error from KMeans.
if len(merged_df) < N_CLUSTERS:
    raise ValueError(
        f"Need at least {N_CLUSTERS} rows with a numeric price to form "
        f"{N_CLUSTERS} clusters, got {len(merged_df)}"
    )

# Turn the free-text 'description' column into TF-IDF features.
# Missing descriptions are treated as empty text so the vectorizer does not fail on NaN.
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
description_tfidf = tfidf.fit_transform(merged_df['description'].fillna(''))

# Standardize price (zero mean, unit variance) so it is on a scale comparable
# to the TF-IDF weights
scaler = StandardScaler()
normalized_price = scaler.fit_transform(merged_df[['price']])

# Convert the sparse TF-IDF matrix to a dense array
description_tfidf = description_tfidf.toarray()

# Combine the TF-IDF and price features into one frame.
# Both parts share merged_df's index so rows stay aligned with the source data,
# and every column gets a unique string label (the two unlabeled frames would
# otherwise both contribute a column named 0).
features = pd.concat(
    [
        pd.DataFrame(description_tfidf, index=merged_df.index).add_prefix('tfidf_'),
        pd.DataFrame(normalized_price, index=merged_df.index, columns=['price_scaled']),
    ],
    axis=1,
)

# Perform K-Means clustering. n_init is pinned so the result does not depend on
# which scikit-learn version's default is in effect.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
merged_df['cluster'] = kmeans.fit_predict(features)

# Visualize the clusters using the first two feature columns for simplicity.
# `features` is a DataFrame, so positional column access must go through .iloc.
fig, ax = plt.subplots()
sns.scatterplot(
    x=features.iloc[:, 0],
    y=features.iloc[:, 1],
    hue=merged_df['cluster'],
    palette='viridis',
    ax=ax,
)
ax.set_title('K-Means Clustering of Coworking Spaces')
ax.set_xlabel('Feature 1 (from description TF-IDF)')
ax.set_ylabel('Feature 2 (from description TF-IDF)')
ax.legend()
plt.show()

# Inspect each cluster: its center in feature space and a few member spaces
for i in range(N_CLUSTERS):
    print(f"\nCluster {i}:")
    cluster_centers = kmeans.cluster_centers_[i]
    print(f"Cluster center: {cluster_centers}")

    cluster_coworking_spaces = merged_df[merged_df['cluster'] == i]
    print(cluster_coworking_spaces[['name', 'price', 'address']].head())
