In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import html
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from collections import Counter

In [None]:
# Load the line-delimited JSON metadata (one JSON record per line) into a DataFrame.
# The file is opened in a context manager so the handle is always closed, it is read
# explicitly as UTF-8 instead of the platform default encoding, and blank lines are
# skipped so that a stray empty line cannot crash json.loads.
meta_data = []
with open('metadata_Grocery_and_Gourmet_Food.json', encoding='utf-8') as meta_file:
    for rec in meta_file:
        rec = rec.strip()
        if rec:
            meta_data.append(json.loads(rec))
meta_df = pd.DataFrame(meta_data)

In [None]:
# Preview the first rows of the raw metadata frame
meta_df.head()

In [None]:
# Keep only the attributes used in the rest of the analysis. The .copy() detaches
# the result from meta_df so later column assignments do not touch the raw frame.
selected_columns = ['asin', 'title', 'description', 'main_cat', 'price']
df = meta_df.loc[:, selected_columns].copy()

# Preview the first rows of the reduced DataFrame
df.head()

In [None]:
# Flatten list-valued descriptions into plain text.
# Every element of the list is kept (joined with single spaces) rather than only the
# first one, and an empty list becomes NaN so it is treated as missing data by the
# later null checks instead of lingering in the column as a list object.
def _flatten_description(value):
    """Return a list description as one string; pass non-list values through unchanged."""
    if isinstance(value, list):
        return ' '.join(str(part) for part in value) if value else np.nan
    return value

df['description'] = df['description'].apply(_flatten_description)


In [None]:
# Preview the first rows of df after the description column was processed
df.head()

In [None]:
# Drop every record whose asin (Amazon's identifier) occurs more than once.
# keep=False removes all copies of a duplicated asin, not just the extra ones.
# This runs on df because sample_df is only created further down; referencing
# sample_df here raised a NameError on a fresh kernel.
df = df.drop_duplicates(subset='asin', keep=False)

In [None]:
# Number of rows currently in df
len(df)

In [None]:
# Work with a 10% sample of the dataset to reduce computational overhead.
# random_state pins the draw so that restarting the kernel reproduces the same
# sample (and therefore the same word counts, plots and exported CSV).
sample_df = df.sample(frac=0.1, random_state=42)
len(sample_df)

In [None]:
# Convert HTML character references (e.g. &amp;) to the characters they stand for.
# html.unescape only works on strings and raises TypeError on NaN, so non-string
# values are passed through untouched for the later cleaning steps to handle.
def _unescape_text(value):
    """Unescape HTML entities in strings; return any other value unchanged."""
    return html.unescape(value) if isinstance(value, str) else value

sample_df['description'] = sample_df['description'].apply(_unescape_text)
sample_df['title'] = sample_df['title'].apply(_unescape_text)


In [None]:
# Strip HTML markup, keeping only the text content; null values become empty strings
def _strip_tags(value):
    """Return the text of value with HTML tags removed, or '' when value is null."""
    if pd.notnull(value):
        return BeautifulSoup(str(value), 'html.parser').get_text()
    return ''

for column in ('description', 'title'):
    sample_df[column] = sample_df[column].apply(_strip_tags)

In [None]:
# Strip everything except ASCII letters, digits and whitespace; nulls become ''
def _drop_special_chars(value):
    """Return str(value) without special characters, or '' when value is null."""
    if pd.notnull(value):
        return re.sub(r'[^a-zA-Z0-9\s]', '', str(value))
    return ''

for column in ('title', 'description'):
    sample_df[column] = sample_df[column].apply(_drop_special_chars)

In [None]:
# Drop records with missing data, as data imputation here would introduce bias.
# The cleaning steps above were applied to sample_df and turn nulls into empty
# strings, so the empty-string -> NaN conversion and the drop must run on sample_df
# (the frame that is analysed and exported), not on the untouched df.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
sample_df = sample_df.replace('', np.nan).dropna()
len(sample_df)

In [None]:
# Word cloud built from all descriptions concatenated into one string
text = ' '.join(sample_df['description'].astype(str))
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Draw the cloud on an explicit figure/axes pair with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Descriptions')
plt.show()

In [None]:
# Most common words in title
# Make sure every title is a plain string (list values are joined with spaces)
sample_df['title'] = sample_df['title'].apply(
    lambda value: str(value) if not isinstance(value, list) else ' '.join(value)
)

# Keep only ASCII letters and whitespace
non_letters = re.compile(r'[^a-zA-Z\s]')
sample_df['title'] = sample_df['title'].apply(lambda value: non_letters.sub('', value))

# Count every whitespace-separated token across all titles
word_frequency = Counter()
for tokens in sample_df['title'].str.split():
    word_frequency.update(tokens)

# Report the ten most frequent words
common_words = word_frequency.most_common(10)
print('Top 10 Most Common Words:')
for word, count in common_words:
    print(f'{word}: {count}')

In [None]:
# Most common words in description
# Make sure every description is a plain string (list values are joined with spaces)
sample_df['description'] = sample_df['description'].apply(
    lambda value: str(value) if not isinstance(value, list) else ' '.join(value)
)

# Keep only ASCII letters and whitespace
non_letters = re.compile(r'[^a-zA-Z\s]')
sample_df['description'] = sample_df['description'].apply(lambda value: non_letters.sub('', value))

# Count every whitespace-separated token across all descriptions
word_frequency = Counter()
for tokens in sample_df['description'].str.split():
    word_frequency.update(tokens)

# Report the ten most frequent words
common_words = word_frequency.most_common(10)
print('Top 10 Most Common Words:')
for word, count in common_words:
    print(f'{word}: {count}')

In [None]:
# Summary statistics of sample_df using the pandas describe() defaults
sample_df.describe()

In [None]:
# Print the column dtypes and non-null counts of sample_df
sample_df.info()

In [None]:
# Write sample_df to food.csv in the current working directory.
# NOTE(review): with the to_csv defaults the DataFrame index is written as the
# first column — confirm downstream readers expect that.
sample_df.to_csv("food.csv")