In [None]:
import re
from urllib.parse import quote

import pandas as pd
import numpy as np

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import seaborn as sns
import matplotlib.pyplot as plt


from dotenv import dotenv_values
from sqlalchemy import create_engine, types
from sqlalchemy.dialects.postgresql import JSON as postgres_json

In [None]:
# Load the business file; lines=True reads it as one JSON object per line.
business_full = pd.read_json(
    path_or_buf="./data/yelp_academic_dataset_business.json",
    lines=True,
)

In [None]:
# Work on a one-column frame that holds only the category strings.
cat = business_full['categories'].to_frame()

In [None]:
# Summary of the one-column frame (dtype and non-null count) before cleaning.
cat.info()

In [None]:
# Step 1 + 2: turn each business' comma-separated category string into a list
# of stripped category names.
#
# The column is rebuilt from `business_full` rather than from `cat` itself, so
# this cell can be re-run: the in-place version crashed on a second run because
# the column already held lists (lists have no .split).
#
# Missing values are mapped to '' first and empty fragments are dropped, so a
# business without categories becomes [] instead of [''] — the latter added a
# bogus empty-string "category" to the transactions.
cat['categories'] = business_full['categories'].fillna('').apply(
    lambda x: [e.strip() for e in x.split(',') if e.strip()]
)

In [None]:
# Confirm that the first cleaned entry is a list.
print(type(cat['categories'].iat[0]))


In [None]:
# Show the category list of the first business.
print(cat['categories'].iat[0])

In [None]:
# Summary of the frame again, now that the column has been converted to lists.
cat.info()

In [None]:
# Display the frame of category lists.
cat

In [None]:
# Step 3: One-hot encode the category lists for mlxtend
# (one boolean column per distinct category, one row per business).
te = TransactionEncoder()
te_ary = te.fit_transform(cat['categories'])
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
# Print a heading followed by the encoded frame.
print('\nOne-Hot Encoded DataFrame:', df_encoded, sep='\n')

In [None]:
# Step 4: Find frequent itemsets (support of at least 0.01), labelled with the
# category names instead of column indices.
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Print them with the highest support first.
print(
    '\nFrequent Itemsets:',
    frequent_itemsets.sort_values('support', ascending=False),
    sep='\n',
)


In [None]:
# Keep the itemsets ordered by support, highest first.
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)

In [None]:
# Show the first 50 rows of the itemset table.
frequent_itemsets.head(50)

In [None]:
# Keep only businesses that list more than one category.
df_filtered = cat.loc[cat['categories'].str.len() > 1]

In [None]:
# Size of the multi-category subset.
df_filtered.shape

In [None]:
# Display the multi-category subset.
df_filtered

In [None]:
# Size of the unfiltered frame, for comparison with df_filtered.
cat.shape

In [None]:
# Step 3 (repeated): one-hot encode only the multi-category businesses.
# Note: te, te_ary and df_encoded from the first pass are overwritten here.
te = TransactionEncoder()
te_ary = te.fit_transform(df_filtered['categories'])
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
# Step 4 (repeated): frequent itemsets on the current df_encoded, again with a
# minimum support of 0.01 and category names as labels.
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Print them with the highest support first.
print(
    '\nFrequent Itemsets:',
    frequent_itemsets.sort_values('support', ascending=False),
    sep='\n',
)

In [None]:
# Drop single-item itemsets; only combinations of two or more categories remain.
frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].map(len) > 1]


In [None]:
# Order by support (highest first) and show the first 50 combinations.
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)
frequent_itemsets.head(n=50)

In [None]:
# Businesses whose category string is exactly 'Home Services' and nothing else.
business_full.loc[business_full['categories'].eq('Home Services')]

# Defining key categories

In [None]:
# Key categories used to filter and label the businesses.
key_categories = [
    'Restaurants',
    'Coffee & Tea',
    'Shopping',
    'Automotive',
    'Beauty & Spas',
    'Health & Medical',
    'Hotels & Travel',
    'Active Life',
    'Arts & Entertainment',
    'Home Services',
    'Local Services',
    'Nightlife',
]
len(key_categories)

## Dataframe containing only key categories

In [None]:
# Build one OR pattern that matches a business if *any* key category occurs in
# its category string. Each name goes through re.escape so it is matched
# literally; a name containing a regex metacharacter (e.g. '+', '(' or '|')
# would otherwise silently change what the pattern matches.
pattern = '|'.join(re.escape(category) for category in key_categories)

# Keep the matching rows: case-insensitive, and rows without a category string
# (NaN) count as non-matching.
# NOTE(review): this is substring matching, so 'Shopping' would also hit a
# category such as 'Shopping Centers' — confirm that is intended.
filtered_df = business_full[business_full['categories'].str.contains(pattern, case=False, na=False)]


In [None]:
# Size of the full dataset, as a baseline for the filtered subset.
business_full.shape

In [None]:
# Number of businesses that mention at least one key category.
len(filtered_df)

In [None]:
# Fraction of the full dataset that the key categories cover.
len(filtered_df) / len(business_full)

In [None]:
# Preview the first rows of the filtered frame.
filtered_df.head()

In [None]:
# Number of filtered businesses whose category string mentions each key
# category (case-insensitive).
counts = {
    key_cat: filtered_df['categories'].str.contains(key_cat, case=False, na=False).sum()
    for key_cat in key_categories
}
counts_df = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
print(counts_df)



In [None]:
total_rows = len(filtered_df)

# Share of filtered rows (in percent) that mention each key category
percentages = {
    key_cat: 100 * (
        filtered_df['categories'].str.contains(key_cat, case=False, na=False).sum() / total_rows
    )
    for key_cat in key_categories
}

# Show as a table, largest share first, rounded to one decimal
percentages_df = pd.DataFrame.from_dict(percentages, orient='index', columns=['percentage'])
percentages_df.sort_values('percentage', ascending=False).round(decimals=1)


In [None]:
# The percentages add up to more than 100, which means some rows mention
# several key categories at once.
sum(percentages_df["percentage"])

In [None]:
# Boolean frame: one column per key category, True where that category occurs
# in the row's category string (case-insensitive).
matches = pd.DataFrame({
    key_cat: filtered_df['categories'].str.contains(key_cat, case=False, na=False)
    for key_cat in key_categories
})

# Number of key categories matched per row
match_counts = matches.sum(axis='columns')

In [None]:
# Per-row count of matched key categories.
match_counts

In [None]:
# Number of rows that match at least two key categories
rows_with_2_or_more = match_counts.ge(2).sum()
rows_with_2_or_more

In [None]:
# Fraction of the filtered rows that belong to several key categories
# (16.5 % when this was written).
rows_with_2_or_more / len(filtered_df)

# Dataframe containing only key categories, no overlap

In [None]:
# Select the rows which contain exactly one key category.
# .copy() gives key_cat_df its own data: a column is added to it further down,
# and doing that on a plain slice of filtered_df raises pandas'
# SettingWithCopyWarning.
key_cat_df = filtered_df[match_counts == 1].copy()

In [None]:
# Display the single-category subset.
key_cat_df

In [None]:
# How much data did we filter out along the way?

# Fraction of the original dataset that is left:
len(key_cat_df) / len(business_full)

In [None]:
# Category shares within the single-category subset
total_rows = len(key_cat_df)

# Share of rows (in percent) that mention each key category
percentages = {
    key_cat: 100 * (
        key_cat_df['categories'].str.contains(key_cat, case=False, na=False).sum() / total_rows
    )
    for key_cat in key_categories
}

# Show as a table, largest share first
percentages_df = pd.DataFrame.from_dict(percentages, orient='index', columns=['percentage'])
percentages_df.sort_values('percentage', ascending=False)

In [None]:
# Total of the per-category percentages; expected to be ~100 here because
# every remaining row matched exactly one key category.
sum(percentages_df["percentage"])

## Creating a new column with the key category label

In [None]:
# create a new column that contains the key category label

# Identify the substring that matched for each row
def get_matched_substring(row):
    """Return the label of the only truthy entry in ``row``.

    ``row`` must offer ``.items()`` yielding ``(label, flag)`` pairs, e.g. one
    row of a boolean DataFrame or a plain dict.

    Returns the label when exactly one flag is truthy, otherwise ``None``
    (no match, or more than one match).
    """
    hits = [label for label, flag in row.items() if flag]
    if len(hits) != 1:
        return None
    return hits[0]

# Label every business in key_cat_df with the one key category it matched.
# - Only the mask rows that belong to key_cat_df are evaluated, instead of
#   every row of the larger `matches` frame.
# - .assign() returns a new frame, so no column is written into a slice of
#   filtered_df (which raised SettingWithCopyWarning), and re-running the cell
#   simply recomputes the same column.
key_cat_df = key_cat_df.assign(
    key_categories=matches.loc[key_cat_df.index].apply(get_matched_substring, axis=1)
)

key_cat_df


# Uploading the new table to SQL database

In [None]:
# Read the database credentials from the .env file

config = dotenv_values()

# Align the key labels below with your .env file; a missing key raises KeyError.
pg_user, pg_host, pg_port, pg_db, pg_schema, pg_pass = [
    config[env_key]
    for env_key in (
        'POSTGRES_USER',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
        'POSTGRES_SCHEMA',
        'POSTGRES_PASS',
    )
]

In [None]:
# Build the connection URL. User name and password are percent-encoded because
# a raw '@', ':' or '/' inside a credential would be read as a URL delimiter
# and break the connection string. safe='' makes quote() encode '/' as well.
# The .env file must therefore hold the raw, un-encoded credentials.
url = (
    f"postgresql://{quote(pg_user, safe='')}:{quote(pg_pass, safe='')}"
    f"@{pg_host}:{pg_port}/{pg_db}"
)

# creating the engine (echo=False: do not log the emitted SQL)
engine = create_engine(url, echo=False)

In [None]:
engine.url # check the url the engine was built with (the password is hidden in the displayed value)

In [None]:
# SQL column types for the upload, keyed by DataFrame column name

dtype_business = dict(
    business_id=types.String,
    name=types.String,
    address=types.String,
    city=types.String,
    state=types.String,
    postal_code=types.String,
    latitude=types.Float,
    longitude=types.Float,
    stars=types.Float,
    review_count=types.Integer,
    is_open=types.Integer,
    attributes=types.JSON,
    categories=types.String,
    hours=types.JSON,
    key_categories=types.String,
)



In [None]:
# Write the labelled businesses to the database as table 'business_key_cat'.
# `schema` selects the schema in which pandas creates the table, an existing
# table of that name is replaced, and the DataFrame index is not written.
key_cat_df.to_sql(
    'business_key_cat',
    engine,
    schema=pg_schema,
    if_exists='replace',
    index=False,
    dtype=dtype_business,
)

In [None]:
# Column overview (dtypes and non-null counts) of the labelled frame.
key_cat_df.info()

In [None]:
# Let pandas show up to 80 characters per cell before truncating.
pd.options.display.max_colwidth = 80

In [None]:
# For every key category, collect the distinct raw category strings it covers;
# the resulting column is named 'distinct_categories' for clarity.
grouped = (
    key_cat_df
    .groupby('key_categories')['categories']
    .unique()
    .reset_index()
    .rename(columns={'categories': 'distinct_categories'})
)


In [None]:
# Display the distinct category strings per key category.
grouped

In [None]:
# Distinct raw category strings per key category.
# NOTE(review): this repeats the earlier groupby but without the rename, so
# `grouped` ends up with a 'categories' column again — confirm this is intended.
grouped = key_cat_df.groupby('key_categories')['categories'].unique().reset_index()