In [1]:
import os
import json
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scripts import *
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 200)
%matplotlib inline

In [2]:
# Absolute path of the `data` folder one level above the current working directory
data_dir = os.path.abspath(os.path.join(os.pardir, "data"))

### Source Files
The scraped FB comments were split into chunks during LangChain preprocessing because of connection/timeout issues, so the results are spread across several CSV files.

In [3]:
# One CSV path per preprocessing chunk (sublists 0 through 8)
fb0, fb1, fb2, fb3, fb4, fb5, fb6, fb7, fb8 = (
    f'{data_dir}/sentiment_analysis_fbcomments-sublist-{chunk_no}.csv'
    for chunk_no in range(9)
)

# Path of the cleaned comments file
fbcomment_cleanfile = f'{data_dir}/fbcomments_cleaned.csv'

### Read CSVs into DataFrames

In [4]:
csv_files = [fb0, fb1, fb2, fb3, fb4, fb5, fb6, fb7, fb8]

# Dictionary mapping each chunk's path (extension removed) to its DataFrame
dfs = {}

for file_path in csv_files:
    # Strip only the trailing extension. Splitting on the first '.' instead
    # would truncate at any dot in the absolute directory path, giving every
    # file the same key so that each read overwrites the previous chunk.
    comment_chunk = os.path.splitext(file_path)[0]

    # Read the CSV file into a DataFrame and store it in the dictionary
    dfs[comment_chunk] = pd.read_csv(file_path)



In [5]:
# Stack every chunk into a single frame with a fresh 0..n-1 index, then
# discard the 'Unnamed: 0' column carried over from the CSV files.
fbcomments = (
    pd.concat(dfs.values(), ignore_index=True)
    .drop(columns='Unnamed: 0')
)

In [6]:
# (rows, columns) of the combined chunks after dropping 'Unnamed: 0'
fbcomments.shape

(2108, 4)

In [7]:
# Preview the first rows of the combined sentiment-analysis output
fbcomments.head()


Unnamed: 0,topic,sentiment,summary,product_rating
0,"['Food', 'Pancit Canton']",positive,The commenter expresses delight in the taste o...,5
1,"['Favorite names for kids', 'LuckyMe']",positive,The commenter's favorite names for their kids ...,5
2,"['Merienda', 'Pancit Canton']",positive,The post is about the user's favorite merienda...,5
3,"['College days', 'Work']",positive,The commenter reminisces about how this produc...,5
4,"['Food', 'LuckyMe']",positive,The commenter enjoys the product and their chi...,5


In [8]:
# Independent single-column copy holding only the topics
df = fbcomments.loc[:, ['topic']].copy()

In [9]:
# Preview the topic-only frame
df.head()

Unnamed: 0,topic
0,"['Food', 'Pancit Canton']"
1,"['Favorite names for kids', 'LuckyMe']"
2,"['Merienda', 'Pancit Canton']"
3,"['College days', 'Work']"
4,"['Food', 'LuckyMe']"


In [10]:
# Frequency of each distinct topic value (50 most common). Values are
# compared exactly as stored, so variants that differ only in letter case
# or element order are tallied as separate entries.
df['topic'].value_counts().head(50)

[]                                               837
['Food', 'LuckyMe']                              269
['LuckyMe', 'product quality']                    77
['product quality']                               69
['Product quality']                               25
['food', 'LuckyMe']                               24
['LuckyMe']                                       14
['Food']                                          14
['Unknown']                                       13
['Favorite things']                               11
['Food', 'Taste']                                 11
['LuckyMePancitCanton', 'jomarson']               11
['Food', 'Pancit Canton']                          9
['Favorite food', 'LuckyMe']                       7
['product quality', 'LuckyMe']                     7
['Spicy food', 'LuckyMe']                          7
['LuckyMe', 'food']                                7
['Food', 'Snacks']                                 6
['Cooking', 'LuckyMe']                        

In [11]:
# Read the cleaned, web-scraped FB comments and discard the 'Unnamed: 0' column
og_df = pd.read_csv(fbcomment_cleanfile).drop(columns='Unnamed: 0')

In [12]:
# Preview the cleaned comments (date and text columns)
og_df.head()

Unnamed: 0,date,text
0,2021-05-30,Wow sarap tlaga bsta pancit canton may fav chi...
1,2021-05-30,Favorite namen ng kids ko yan momsh Lalo ng pa...
2,2021-05-30,Favorite merienda namin to lalo na ang pancit ...
3,2021-05-30,I remember the days na ito yung nagpapa surviv...
4,2021-05-31,Sarap fav.namin ng mga anak ko pero my limit p...


In [13]:
# The two frames are paired purely by row position, so they must contain the
# same number of rows. With unequal lengths, concat(axis=1) would pad the
# shorter side with NaN and leave comments without analysis (or vice versa)
# instead of failing.
if len(og_df) != len(fbcomments):
    raise ValueError(
        f'Row count mismatch: {len(og_df)} cleaned comments vs '
        f'{len(fbcomments)} sentiment-analysis rows'
    )

# Place the original comments side by side with their analysis columns
combined_df = pd.concat([og_df, fbcomments], axis=1)
combined_df.head()

Unnamed: 0,date,text,topic,sentiment,summary,product_rating
0,2021-05-30,Wow sarap tlaga bsta pancit canton may fav chi...,"['Food', 'Pancit Canton']",positive,The commenter expresses delight in the taste o...,5
1,2021-05-30,Favorite namen ng kids ko yan momsh Lalo ng pa...,"['Favorite names for kids', 'LuckyMe']",positive,The commenter's favorite names for their kids ...,5
2,2021-05-30,Favorite merienda namin to lalo na ang pancit ...,"['Merienda', 'Pancit Canton']",positive,The post is about the user's favorite merienda...,5
3,2021-05-30,I remember the days na ito yung nagpapa surviv...,"['College days', 'Work']",positive,The commenter reminisces about how this produc...,5
4,2021-05-31,Sarap fav.namin ng mga anak ko pero my limit p...,"['Food', 'LuckyMe']",positive,The commenter enjoys the product and their chi...,5


In [14]:
# Rename the 'text' column to 'comment'
combined_df = combined_df.rename(columns={'text': 'comment'})

In [15]:
# Preview the combined frame after renaming 'text' to 'comment'
combined_df.head()

Unnamed: 0,date,comment,topic,sentiment,summary,product_rating
0,2021-05-30,Wow sarap tlaga bsta pancit canton may fav chi...,"['Food', 'Pancit Canton']",positive,The commenter expresses delight in the taste o...,5
1,2021-05-30,Favorite namen ng kids ko yan momsh Lalo ng pa...,"['Favorite names for kids', 'LuckyMe']",positive,The commenter's favorite names for their kids ...,5
2,2021-05-30,Favorite merienda namin to lalo na ang pancit ...,"['Merienda', 'Pancit Canton']",positive,The post is about the user's favorite merienda...,5
3,2021-05-30,I remember the days na ito yung nagpapa surviv...,"['College days', 'Work']",positive,The commenter reminisces about how this produc...,5
4,2021-05-31,Sarap fav.namin ng mga anak ko pero my limit p...,"['Food', 'LuckyMe']",positive,The commenter enjoys the product and their chi...,5


In [147]:
# Write the combined frame (without the row index) to the final CSV used for further processing
viz_csv_path = f'{data_dir}/fbcomments_for_viz.csv'
combined_df.to_csv(viz_csv_path, index=False)