In [1]:
import pandas as pd

In [11]:
# Load the generated question/answer dataset; every later cell reads from `df`.
# NOTE(review): the absolute /content path looks like a Colab mount — confirm
# before running in another environment.
QA_CSV_PATH = '/content/question_answers_final.csv'
df = pd.read_csv(QA_CSV_PATH)

In [12]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,business_id,name,question,question_category,answer,context
0,0,JGVxKGvQEaMZ6O_2l0UucA,Adelphia Restaurant & Events,How is the food at Adelphia Restaurant & Events?,Food Quality,"Based on the reviews, the food at Adelphia Res...","Review 1 (2012-02-16 19:23:50, Very Positive):..."
1,1,JGVxKGvQEaMZ6O_2l0UucA,Adelphia Restaurant & Events,Is the food tasty at Adelphia Restaurant & Eve...,Food Quality,"Yes, according to the reviews, the food at Ade...","Review 1 (2014-08-25 19:46:11, Positive):\nAde..."
2,2,JGVxKGvQEaMZ6O_2l0UucA,Adelphia Restaurant & Events,How are the portions at Adelphia Restaurant & ...,Food Quality,The reviews mention that the portions at Adelp...,"Review 1 (2012-02-16 19:23:50, Very Positive):..."
3,3,JGVxKGvQEaMZ6O_2l0UucA,Adelphia Restaurant & Events,Is the food fresh at Adelphia Restaurant & Eve...,Food Quality,"Yes, according to the reviews, the food at Ade...","Review 1 (2017-09-17 18:23:23, Very Positive):..."
4,4,JGVxKGvQEaMZ6O_2l0UucA,Adelphia Restaurant & Events,How is the menu at Adelphia Restaurant & Events?,Food Quality,The menu at Adelphia Restaurant & Events is de...,"Review 1 (2012-02-16 19:23:50, Very Positive):..."


In [14]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

def analyze_restaurant_questions(df):
    """Summarise how questions are spread over categories and restaurants.

    Args:
        df: DataFrame with at least the columns ``question_category`` and
            ``business_id``.

    Returns:
        A 7-tuple ``(category_counts, restaurant_by_category, restaurant_counts,
        category_bar, category_pie, heatmap, restaurant_bar)`` where

        * ``category_counts`` is ``df['question_category'].value_counts()``,
        * ``restaurant_by_category`` is the ``business_id`` x
          ``question_category`` cross-tabulation,
        * ``restaurant_counts`` is the number of rows per ``business_id``,
        * the remaining four items are the Plotly figures built from those
          tables (bar, pie, heatmap, bar).
    """

    def _count_bar(counts, title, x_label):
        # Bar chart of a count Series, coloured by the count itself.
        bar = px.bar(
            x=counts.index,
            y=counts.values,
            title=title,
            labels={'x': x_label, 'y': 'Number of Questions'},
            color=counts.values,
            color_continuous_scale='Viridis'
        )
        bar.update_layout(xaxis_tickangle=-45, showlegend=False, height=600)
        return bar

    # --- Tables -----------------------------------------------------------
    per_category = df['question_category'].value_counts()
    per_restaurant_and_category = pd.crosstab(
        df['business_id'], df['question_category']
    )
    per_restaurant = df.groupby('business_id').size()

    # --- Figures ----------------------------------------------------------
    category_bar = _count_bar(
        per_category, 'Distribution of Questions by Category', 'Category'
    )

    category_pie = px.pie(
        values=per_category.values,
        names=per_category.index,
        title='Proportion of Questions by Category'
    )

    heatmap = px.imshow(
        per_restaurant_and_category,
        title='Questions Distribution: Restaurants vs Categories',
        aspect='auto',
        color_continuous_scale='Viridis'
    )
    heatmap.update_layout(xaxis_tickangle=-45, height=800)

    restaurant_bar = _count_bar(
        per_restaurant, 'Number of Questions per Restaurant', 'Restaurant'
    )

    return (
        per_category,
        per_restaurant_and_category,
        per_restaurant,
        category_bar,
        category_pie,
        heatmap,
        restaurant_bar,
    )

In [15]:
# Run the analysis once; keep the summary tables and the four figures.
category_counts, rest_cat_matrix, rest_counts, fig1, fig2, hmap, fig4 = analyze_restaurant_questions(df)

fig1.show()
fig2.show()
hmap.show()
fig4.show()

# The output saved with this cell lists these two tables, but the code did not
# print them, so a fresh run could not reproduce it. Print them explicitly.
print("Category Counts:")
print(category_counts)

print("\nRestaurant Category Matrix:")
print(rest_cat_matrix)

Category Counts:
question_category
Service                         3071
Popular Dishes                  2434
Special Features/Events         2026
Food Quality                    1560
Ambiance                        1265
Special Dietary Requirements    1229
Price                           1155
Location/Accessibility          1021
Reservation Process              812
Wait Times                       772
Name: count, dtype: int64

Restaurant Category Matrix:
question_category       Ambiance  Food Quality  Location/Accessibility  \
business_id                                                              
-T5SfUI_QqgBrkq4D8Jn7w        32            31                      21   
-pyobEWdpZ6jUp42f3I2xw        20            20                      20   
-qk__vjJxfJrCWn3UaMv_Q        25            47                      21   
1hPvAr6D7yPpGbD_4RMqmQ        25            29                      21   
47Xuiwsv_YsJIm7Px_mNWQ        32            29                      22   
4WJqHhAq_CbtpzNiUuE9Eg

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Per-restaurant train / validation / test split.
#
# Each restaurant (business_id) is split on its own so that it contributes rows
# to all three sets. Within a restaurant each split is stratified by
# question_category whenever scikit-learn accepts it.
HOLDOUT_FRACTION = 0.3         # share of a restaurant's rows held out of train
TEST_SHARE_OF_HOLDOUT = 2 / 3  # share of the hold-out that becomes the test set
                               # => roughly 70% train / 10% val / 20% test
SPLIT_SEED = 42


def _split_with_fallback(frame, test_size, stratify_col='question_category',
                         random_state=SPLIT_SEED):
    """Split ``frame`` in two, stratified by ``stratify_col`` when possible.

    Args:
        frame: DataFrame to split.
        test_size: Fraction of rows that goes into the second part.
        stratify_col: Column whose class proportions should be preserved.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(first, second, stratified)`` where ``stratified`` is False when the
        stratified split raised ``ValueError`` and a plain random split was
        used instead.

    Raises:
        ValueError: if even the unstratified split is impossible (for example
            the frame has too few rows to fill both parts).
    """
    try:
        first, second = train_test_split(
            frame,
            test_size=test_size,
            stratify=frame[stratify_col],
            random_state=random_state
        )
        return first, second, True
    except ValueError:
        # Stratification was rejected (e.g. a category with a single row).
        # Fall back to a plain random split for THIS split only.
        pass

    first, second = train_test_split(
        frame,
        test_size=test_size,
        random_state=random_state
    )
    return first, second, False


train_list = []
val_list = []
test_list = []
unstratified_restaurants = []  # business_ids where any split had to fall back

unique_restaurants = df['business_id'].unique()

for biz_id in unique_restaurants:
    biz_df = df[df['business_id'] == biz_id]

    # The two splits fall back independently: a failure while dividing the
    # hold-out into val/test no longer discards a stratified train split.
    train_df, temp_df, train_stratified = _split_with_fallback(
        biz_df, HOLDOUT_FRACTION
    )
    val_df, test_df, holdout_stratified = _split_with_fallback(
        temp_df, TEST_SHARE_OF_HOLDOUT
    )

    if not (train_stratified and holdout_stratified):
        unstratified_restaurants.append(biz_id)

    train_list.append(train_df)
    val_list.append(val_df)
    test_list.append(test_df)

train_final = pd.concat(train_list, axis=0, ignore_index=True)
val_final = pd.concat(val_list, axis=0, ignore_index=True)
test_final = pd.concat(test_list, axis=0, ignore_index=True)

# Every input row must land in exactly one split.
assert len(train_final) + len(val_final) + len(test_final) == len(df), \
    "rows were lost or duplicated while splitting"

print("Train size:", len(train_final))
print("Validation size:", len(val_final))
print("Test size:", len(test_final))
print("Total size:", len(train_final) + len(val_final) + len(test_final))
print("Restaurants with an unstratified fallback split:", len(unstratified_restaurants))

print("Train category distribution:")
print(train_final['question_category'].value_counts(normalize=True))

print("Val category distribution:")
print(val_final['question_category'].value_counts(normalize=True))

print("Test category distribution:")
print(test_final['question_category'].value_counts(normalize=True))

In [None]:
# Compare how many questions each restaurant has overall and in each split.
restaurant_reports = (
    ("Overall distribution by restaurant:", df),
    ("\nTrain set distribution by restaurant:", train_final),
    ("\nValidation set distribution by restaurant:", val_final),
    ("\nTest set distribution by restaurant:", test_final),
)

for report_heading, report_frame in restaurant_reports:
    print(report_heading)
    print(report_frame['business_id'].value_counts())

In [None]:
# Row counts per (restaurant, category) pair in each split.
pair_reports = (
    ("Train distribution by restaurant and category:", train_final),
    ("\nValidation distribution by restaurant and category:", val_final),
    ("\nTest distribution by restaurant and category:", test_final),
)

for pair_heading, pair_frame in pair_reports:
    pair_counts = pair_frame.groupby(['business_id', 'question_category']).size()
    print(pair_heading)
    print(pair_counts)

In [None]:
# Persist the three splits as CSV files in the current working directory,
# without writing the row index.
split_exports = (
    ("train_final.csv", train_final),
    ("val_final.csv", val_final),
    ("test_final.csv", test_final),
)

for export_path, export_frame in split_exports:
    export_frame.to_csv(export_path, index=False)