In [None]:
import pandas as pd
import json

# Business

In [None]:
# Read only the first BUSINESS_MAX_LINES lines of the JSON-lines business file
# (one JSON object per line) so the sample loads quickly.
BUSINESS_MAX_LINES = 1000  # Change this number to get more/less

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on (or garble) non-ASCII text.
with open("./data/yelp_academic_dataset_business.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= BUSINESS_MAX_LINES:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")

df_business = pd.DataFrame(records)

In [None]:
# Preview the first rows of the loaded business records.
df_business.head()


In [None]:
# Summarise the business frame: column names, non-null counts and dtypes.
df_business.info()

## Categories in Business file

In [None]:
# Select the column by label instead of by position, so the cell does not
# depend on the column order of the frame.
# NOTE(review): assumes position 12 was the 'categories' column (the label
# used by the "Bars" filter on this frame) — confirm.
col_series = df_business["categories"]

# Split every non-null entry on commas and trim surrounding whitespace.
split_lists = col_series.dropna().apply(lambda x: [item.strip() for item in x.split(',')])
all_elements = [item for sublist in split_lists for item in sublist]
unique_elements = set(all_elements)

# A set has no stable order; print it sorted so the output is reproducible
# from one run to the next.
print("Unique elements:", sorted(unique_elements))
print("Total unique elements:", len(unique_elements))

In [None]:
def elements_count(my_data_local, col_index_local, sep_local=","):
    """Count how often each separated element occurs in one column.

    Every string entry of the selected column is split on ``sep_local``; each
    piece is stripped of surrounding whitespace and tallied. Entries that are
    not strings (for example ``None`` or ``NaN``) are skipped.

    Parameters
    ----------
    my_data_local : pandas.DataFrame
        Frame that holds the column to analyse.
    col_index_local : int or str
        Position of the column (int, as before) or its label (str).
    sep_local : str, default ","
        Separator the entries are split on.

    Returns
    -------
    dict
        Maps each element to its count, ordered by descending count.
        Elements with equal counts keep their first-seen order.
    """
    # Read the column once instead of materialising every row via iterrows().
    if isinstance(col_index_local, str):
        column = my_data_local[col_index_local]
    else:
        column = my_data_local.iloc[:, col_index_local]

    element_count_dict = {}
    for value in column:
        # Skip missing or non-text entries.
        if not isinstance(value, str):
            continue
        for element in value.split(sep_local):
            element = element.strip()  # clean up whitespace
            element_count_dict[element] = element_count_dict.get(element, 0) + 1

    # sorted() is stable, so ties stay in insertion (first-seen) order.
    return dict(sorted(element_count_dict.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Resolve the column position from its label rather than hard-coding it, so
# the call does not depend on the column order of the frame.
# NOTE(review): assumes position 12 was the 'categories' column — confirm.
elements_count(df_business, df_business.columns.get_loc("categories"))

Filtering by category:

In [None]:
# Keep the rows whose 'categories' text contains "Bars"; rows with a missing
# value are excluded (na=False). This is a substring match, so any category
# whose name merely contains "Bars" also qualifies.
filtered_df = df_business.loc[
    df_business['categories'].str.contains('Bars', na=False)
]
filtered_df

# Reviews

In [None]:
# Read only the first REVIEW_MAX_LINES lines of the JSON-lines review file.
REVIEW_MAX_LINES = 1000  # Change this number to get more/less

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on (or garble) non-ASCII review text.
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= REVIEW_MAX_LINES:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")

df_reviews = pd.DataFrame(records)

In [None]:
# Preview the first rows of the loaded review records.
df_reviews.head()

In [None]:
# Scan the whole review file and keep only the records dated in REVIEW_YEAR.
REVIEW_YEAR = "2020"

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on (or garble) non-ASCII review text.
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")
            continue
        year_field = record.get("date")
        # Match the year as a prefix rather than anywhere in the value, so
        # digits elsewhere in the string can never produce a false match.
        # NOTE(review): assumes `date` is formatted year-first
        # (e.g. "2020-05-17 ...") — confirm against the dataset documentation.
        if year_field and str(year_field).startswith(REVIEW_YEAR):
            records.append(record)

df_review2020 = pd.DataFrame(records)

In [None]:
# (rows, columns) of the 2020 review subset, i.e. how many reviews matched.
df_review2020.shape

## Checking how much data there is for 2022
2022 is the last year that contains data.

In [None]:
# No other cell in this notebook defines `df_review2022`, so it is built here:
# scan the whole review file and keep only the records dated in 2022. This
# keeps the cell runnable on a fresh kernel (Restart & Run All).
records = []
# Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")
            continue
        year_field = record.get("date")
        # NOTE(review): assumes `date` is formatted year-first
        # (e.g. "2022-01-05 ...") — confirm against the dataset documentation.
        if year_field and str(year_field).startswith("2022"):
            records.append(record)

df_review2022 = pd.DataFrame(records)

# An empty result has no 'date' column; only convert when rows were found.
if not df_review2022.empty:
    df_review2022['date'] = pd.to_datetime(df_review2022['date'])

In [None]:
# Show the 2022 reviews in chronological order (returns a sorted copy; the
# original frame is left untouched).
df_review2022.sort_values(by='date')


In [None]:
# NOTE(review): `df_review2019` is not defined in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel. It presumably
# came from re-running the year-filter cell with "2019" — confirm, then either
# add that load step or remove this cell.
df_review2019.shape

# Check-ins

In [None]:
# Read only the first CHECKIN_MAX_LINES lines of the JSON-lines check-in file.
CHECKIN_MAX_LINES = 1000  # Change this number to get more/less

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which is not guaranteed to match the file.
with open("./data/yelp_academic_dataset_checkin.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= CHECKIN_MAX_LINES:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")

df_checkin = pd.DataFrame(records)

In [None]:
# Preview the first rows of the loaded check-in records.
df_checkin.head()

In [None]:
# Scan the whole check-in file and keep the records whose `date` field
# mentions CHECKIN_YEAR.
CHECKIN_YEAR = "2023"

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which is not guaranteed to match the file.
with open("./data/yelp_academic_dataset_checkin.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")
            continue
        year_field = record.get("date")
        # Substring match on purpose: the whole record is kept as soon as the
        # year appears anywhere in the field.
        # NOTE(review): presumably `date` holds several timestamps in one
        # string, so a kept record may also contain other years — confirm.
        if year_field and CHECKIN_YEAR in str(year_field):
            records.append(record)

df_checkin2023 = pd.DataFrame(records)

In [None]:
# Display the check-in records whose date field mentions 2023.
df_checkin2023

# User

In [None]:
# Read only the first USER_MAX_LINES lines of the JSON-lines user file.
USER_MAX_LINES = 1000  # Change this number to get more/less

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on (or garble) non-ASCII user names.
with open("./data/yelp_academic_dataset_user.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= USER_MAX_LINES:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")

df_user = pd.DataFrame(records)

In [None]:
# Preview the first rows of the loaded user records.
df_user.head()

In [None]:
# Summarise the user frame: column names, non-null counts and dtypes.
df_user.info()

In [None]:
# Print the user column labels as a plain list so none are elided.
print(list(df_user.columns))

In [None]:
# Scan the whole user file and keep only the users whose `yelping_since`
# value falls in USER_SINCE_YEAR.
USER_SINCE_YEAR = "2023"

records = []
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on (or garble) non-ASCII user names.
with open("./data/yelp_academic_dataset_user.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep reading.
            print(f"Skipping bad line {i}")
            continue
        year_field = record.get("yelping_since")
        # Match the year as a prefix rather than anywhere in the value, so
        # digits elsewhere in the string can never produce a false match.
        # NOTE(review): assumes `yelping_since` is formatted year-first
        # (e.g. "2023-01-25 ...") — confirm against the dataset documentation.
        if year_field and str(year_field).startswith(USER_SINCE_YEAR):
            records.append(record)

df_user_since = pd.DataFrame(records)

In [None]:
# Preview the first rows of the users filtered on `yelping_since`.
df_user_since.head()