**Summmary:**

In [22]:
import pandas as pd
import gzip
import json
import numpy as np


def parse(path):
  g = gzip.open('data/' + path, 'r')
  for l in g:
    yield json.loads(l)



wyoming_reviews = pd.DataFrame(parse("review-Wyoming.json.gz"))
wyoming_metadata = pd.DataFrame(parse("meta-Wyoming.json.gz"))

In [18]:
def parse_first_n(path, n=10000):
    g = gzip.open('data/' + path, 'r')
    for i, l in enumerate(g):
        if i >= n:
            break
        yield json.loads(l)

We dont have rating text here or the geomap so we will need to transform the JSON to get some of that but it looks like the **GOLD** is in the meta data from the JSON file since this is the **ONLY CSV**

The GMAP_ID is the ID of the business itself so we could join tables pretty easy if we needed to on that but would need to **Group by the business** on some bound so we dont repeat the business and increase the size of the table.

Did the state column load in properly?

In [2]:
import gzip
import json
import pandas as pd
import numpy as np
from pathlib import Path

# ----------------------------------------------------
# 1) Loader for .json.gzp (gzip JSON Lines)
# ----------------------------------------------------
def load_json_gzp(path: str) -> pd.DataFrame:
    """
    Loads gzip-compressed JSON Lines:
      {"a":1}\n{"a":2}\n...
    """
    records = []
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            records.append(json.loads(line))
    return pd.DataFrame(records)

# ----------------------------------------------------
# 2) Create 8 geographic regions (2 lat × 4 lon)
# ----------------------------------------------------
def add_va_regions(
    df: pd.DataFrame,
    lat_col="latitude",
    lon_col="longitude",
    n_lat_bins=2,
    n_lon_bins=4,
    method="equal_width",  # "equal_width" or "quantile"
):
    d = df.copy()

    # Clean coordinates
    d = d.dropna(subset=[lat_col, lon_col])
    d[lat_col] = pd.to_numeric(d[lat_col], errors="coerce")
    d[lon_col] = pd.to_numeric(d[lon_col], errors="coerce")
    d = d.dropna(subset=[lat_col, lon_col])

    # Bin lat/lon
    if method == "quantile":
        d["lat_bin"] = pd.qcut(d[lat_col], q=n_lat_bins, labels=False, duplicates="drop")
        d["lon_bin"] = pd.qcut(d[lon_col], q=n_lon_bins, labels=False, duplicates="drop")
    else:
        lat_edges = np.linspace(d[lat_col].min(), d[lat_col].max(), n_lat_bins + 1)
        lon_edges = np.linspace(d[lon_col].min(), d[lon_col].max(), n_lon_bins + 1)
        d["lat_bin"] = pd.cut(d[lat_col], lat_edges, labels=False, include_lowest=True)
        d["lon_bin"] = pd.cut(d[lon_col], lon_edges, labels=False, include_lowest=True)

    d["lat_bin"] = d["lat_bin"].astype("Int64")
    d["lon_bin"] = d["lon_bin"].astype("Int64")

    # Region id: 0–7
    d["region_id"] = (d["lat_bin"] * n_lon_bins + d["lon_bin"]).astype("Int64")

    # Human-readable names
    lon_names = ["West", "MidWest", "MidEast", "East"]
    d["region_name"] = d.apply(
        lambda r: (
            f"{'South' if r.lat_bin == 0 else 'North'}-"
            f"{lon_names[int(r.lon_bin)]}"
        )
        if pd.notna(r.region_id)
        else pd.NA,
        axis=1,
    )

    return d

# ----------------------------------------------------
# 3) Load Virginia datasets
# ----------------------------------------------------
meta_path = "data/meta-Virginia.json.gz"
review_path = "data/review-Virginia.json.gz"

df_meta = load_json_gzp(meta_path)
df_review = load_json_gzp(review_path)

# ----------------------------------------------------
# 4) Assign regions using meta, then attach to reviews
# ----------------------------------------------------
df_meta = add_va_regions(df_meta, method="equal_width")

region_lookup = (
    df_meta[["gmap_id", "region_id", "region_name"]]
    .drop_duplicates("gmap_id")
)

df_review = df_review.merge(region_lookup, on="gmap_id", how="left")

# ----------------------------------------------------
# 5) Example analyses (review culture by region)
# ----------------------------------------------------
# BUSINESS-LEVEL
business_summary = (
    df_meta.groupby(["region_id", "region_name"])
    .agg(
        n_businesses=("gmap_id", "nunique"),
        avg_rating=("avg_rating", "mean"),
        median_rating=("avg_rating", "median"),
        avg_num_reviews=("num_of_reviews", "mean"),
    )
    .reset_index()
    .sort_values("region_id")
)

# REVIEW-LEVEL
df_review["review_length"] = (
    df_review["text"].fillna("").astype(str).str.len()
    if "text" in df_review.columns else np.nan
)

review_summary = (
    df_review.groupby(["region_id", "region_name"])
    .agg(
        n_reviews=("gmap_id", "size"),
        avg_review_rating=("rating", "mean") if "rating" in df_review.columns else ("gmap_id", "size"),
        avg_review_length=("review_length", "mean"),
    )
    .reset_index()
    .sort_values("region_id")
)

print("Business summary by region:")
print(business_summary)

print("\nReview summary by region:")
print(review_summary)



Business summary by region:
   region_id region_name  n_businesses  avg_rating  median_rating  \
0          0  South-West             3    4.366667            4.2   
1          3  South-East            11    4.154545            4.1   
2          4  North-West        119017    4.273322            4.4   

   avg_num_reviews  
0        58.333333  
1       183.181818  
2       133.343484  

Review summary by region:
   region_id region_name  n_reviews  avg_review_rating  avg_review_length
0          0  South-West        175           4.062857          27.057143
1          3  South-East       2015           3.946898          95.353846
2          4  North-West   15955748           4.264254          99.268954


In [3]:
biz = df_meta[["gmap_id", "avg_rating", "num_of_reviews"]].dropna()

biz["num_of_reviews"] = biz["num_of_reviews"].astype(int)
biz["avg_rating"] = biz["avg_rating"].astype(float)


In [4]:
bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, biz["num_of_reviews"].max() + 1]
labels = [f"{bins[i]}–{bins[i+1]-1}" for i in range(len(bins)-1)]

biz["review_bin"] = pd.cut(
    biz["num_of_reviews"],
    bins=bins,
    labels=labels,
    right=False
)

stability_table = (
    biz.groupby("review_bin")
    .agg(
        n_businesses=("gmap_id", "count"),
        rating_std=("avg_rating", "std"),
        rating_var=("avg_rating", "var"),
        mean_rating=("avg_rating", "mean"),
    )
    .reset_index()
)

print(stability_table)

  review_bin  n_businesses  rating_std  rating_var  mean_rating
0        1–1          4228    1.223072    1.495906     4.271949
1        2–4         10213    0.875691    0.766835     4.282415
2        5–9         19405    0.722017    0.521309     4.263679
3      10–19         15062    0.661144    0.437111     4.256619
4      20–49         21531    0.620459    0.384970     4.268069
5      50–99         16440    0.567684    0.322265     4.286089
6    100–199         12988    0.492077    0.242140     4.310371
7    200–499         12316    0.451288    0.203661     4.282941
8    500–999          5013    0.410508    0.168517     4.214981
9  1000–9998          2477    0.381416    0.145479     4.251756


  biz.groupby("review_bin")


In [5]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(stability_table["review_bin"], stability_table["rating_std"], marker="o")
plt.xticks(rotation=45)
plt.ylabel("Std Dev of Avg Rating")
plt.xlabel("Number of Reviews (binned)")
plt.title("Rating Volatility vs Review Count")
plt.grid(True)
plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [14]:
pip install matplotlib scikit-learn

Collecting matplotlibNote: you may need to restart the kernel to use updated packages.

  Using cached matplotlib-3.10.8-cp310-cp310-win_amd64.whl.metadata (52 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Using cached matplotlib-3.10.8-cp310-cp310-win_amd64.whl (8.1 MB)
Using cached scikit_learn-1.7.2-cp310-cp310-win_amd64.whl (8.9 MB)
Using cached contourpy-1.3.2-cp310-cp310-win_amd64.whl (221 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Installing collected packages: cycler, contourpy, scikit-learn, matplotlib

   -------------------- ------------------- 2/4 [scikit-learn]
   -------------------- ------------------- 2/4 [scikit-learn]
   -------------------- ------------------- 2/4 [scikit-


[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting matplotlib
  Downloading matplotlib-3.10.8-cp310-cp310-win_amd64.whl.metadata (52 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.1-cp310-cp310-win_amd64.whl.metadata (116 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-12.1.0-cp310-cp310-win_amd64.whl.metadata (9.0 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.3.2-py3-none-any.whl.metadata (5.8 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Co


[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
stability_table["delta_std"] = stability_table["rating_std"].diff().abs()
stability_table

epsilon = 0.02  # ratings barely change beyond this
stable_bins = stability_table[stability_table["delta_std"] < epsilon]
print(stable_bins.head())




Empty DataFrame
Columns: [review_bin, n_businesses, rating_std, rating_var, mean_rating, delta_std]
Index: []


In [7]:
biz_stable = biz[biz["num_of_reviews"] >= 40]

In [8]:
biz["weight"] = np.log1p(biz["num_of_reviews"])


In [9]:
def add_geo_cells(df, lat_col="latitude", lon_col="longitude", cell_size=0.1):
    """
    cell_size ≈ 0.1 degrees ≈ 11km latitude
    """
    d = df.copy()
    d["lat_cell"] = (d[lat_col] // cell_size) * cell_size
    d["lon_cell"] = (d[lon_col] // cell_size) * cell_size
    d["geo_cell"] = d["lat_cell"].astype(str) + "_" + d["lon_cell"].astype(str)
    return d

df_meta_cells = add_geo_cells(df_meta)


In [10]:
cell_features = (
    df_meta_cells.groupby("geo_cell")
    .agg(
        n_businesses=("gmap_id", "nunique"),
        avg_reviews_per_biz=("num_of_reviews", "mean"),
        median_reviews_per_biz=("num_of_reviews", "median"),
        pct_high_review_biz=("num_of_reviews", lambda x: (x >= 100).mean()),
        avg_rating=("avg_rating", "mean"),
    )
    .reset_index()
)


In [11]:
if "categories" in df_meta_cells.columns:
    cat_div = (
        df_meta_cells.explode("categories")
        .groupby("geo_cell")["categories"]
        .nunique()
        .rename("category_diversity")
    )
    cell_features = cell_features.merge(cat_div, on="geo_cell", how="left")


In [15]:
from sklearn.preprocessing import StandardScaler

features = [
    "n_businesses",
    "avg_reviews_per_biz",
    "pct_high_review_biz",
]

if "category_diversity" in cell_features.columns:
    features.append("category_diversity")

X = cell_features[features].fillna(0)

scaler = StandardScaler()
cell_features["urban_index"] = scaler.fit_transform(X).mean(axis=1)



In [16]:
cell_features["urban_rural"] = pd.qcut(
    cell_features["urban_index"],
    q=2,
    labels=["rural", "urban"]
)



In [17]:
df_meta_cells = df_meta_cells.merge(
    cell_features[["geo_cell", "urban_rural", "urban_index"]],
    on="geo_cell",
    how="left"
)

df_review = df_review.merge(
    df_meta_cells[["gmap_id", "urban_rural", "urban_index"]],
    on="gmap_id",
    how="left"
)


In [18]:
df_meta_cells.groupby("urban_rural").agg(
    avg_rating=("avg_rating", "mean"),
    median_reviews=("num_of_reviews", "median"),
    pct_5star=("avg_rating", lambda x: (x >= 4.5).mean()),
)


  df_meta_cells.groupby("urban_rural").agg(


Unnamed: 0_level_0,avg_rating,median_reviews,pct_5star
urban_rural,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rural,4.489671,8.0,0.65384
urban,4.262043,38.0,0.462294


In [19]:
df_review["review_len"] = df_review["text"].fillna("").str.len()

df_review.groupby("urban_rural").agg(
    avg_review_rating=("rating", "mean"),
    avg_review_length=("review_len", "mean"),
    n_reviews=("gmap_id", "size"),
)


  df_review.groupby("urban_rural").agg(


Unnamed: 0_level_0,avg_review_rating,avg_review_length,n_reviews
urban_rural,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rural,4.489554,98.592364,165086
urban,4.262017,99.413384,15816382
