In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
from nltk import FreqDist, sent_tokenize, word_tokenize
import nltk

In [2]:
# Download NLTK's 'punkt' resource up front; the word_tokenize / sent_tokenize
# helpers imported above are used later in this notebook and need tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chethanats/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Request headers sent with every listing-page request: the site's origin,
# the category page as referer, and a desktop Chrome user-agent string.
header = dict([
    ('Origin', 'https://www.1mg.com'),
    ('Referer', 'https://www.1mg.com/categories/exclusive/immunity-boosters/vitamin-c-734'),
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.97 Safari/537.36',
    ),
])

In [4]:
# Accumulator for the per-page statistics dicts.
results = list()

In [5]:
# Crawl the paginated Vitamin C listing and compute simple token statistics
# (word and sentence counts, vocabulary size, most frequent tokens) for the
# product descriptions found on each page. One dict per successfully parsed
# page is collected in `results`.
BASE_URL = 'https://www.1mg.com/categories/exclusive/immunity-boosters/vitamin-c-734'
MAX_PAGES = 100
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

# Rebuild the list from scratch so that re-running this cell does not append a
# second copy of every page to rows collected by an earlier run.
results = []

# Track consecutive pages with identical text. The output saved in this
# notebook shows the same numbers for every page, which suggests the server
# may not be varying the description markup with the `page` parameter.
previous_text = None
repeated_pages = []

for i in range(1, MAX_PAGES + 1):  #100 pages
    page_url = f'{BASE_URL}?page={i}'
    try:
        response = requests.get(url=page_url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A timeout or dropped connection on one page should not abort the crawl.
        print(f"Failed to fetch page {i}: {exc}")
        continue

    if response.status_code == 200:
        page_soup = soup(response.content, 'lxml')

        # find_all is the supported spelling; findAll is a deprecated alias.
        product_descriptions = page_soup.find_all('div', {'class': 'style__product-description___2XaG0'})
        text_content = ' '.join(desc.text for desc in product_descriptions)

        if text_content and text_content == previous_text:
            repeated_pages.append(i)
        previous_text = text_content

        # NLP Statistics
        words = word_tokenize(text_content)
        sentences = sent_tokenize(text_content)
        freq_dist = FreqDist(words)

        stats = {
            'page': i,
            'total_words': len(words),
            'unique_words': len(set(words)),
            'top_5_words': freq_dist.most_common(5),
            # Guard the division: a page with no matching descriptions has no words.
            'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
            'total_sentences': len(sentences)
        }

        results.append(stats)
    else:
        print(f"Failed to fetch page {i}")

if repeated_pages:
    # One summary line instead of a warning per page keeps the cell output short.
    print(
        f"Warning: {len(repeated_pages)} page(s) returned the same description text "
        f"as the page fetched before them, so their statistics are duplicates: {repeated_pages}"
    )


In [7]:
# Tabulate the per-page statistics: one row per dict collected in `results`.
df_results = pd.DataFrame(data=results)

# Persist the table to disk, leaving the row index out of the file.
df_results.to_csv(path_or_buf='nlp_stats.csv', index=False)

In [9]:
# Load the saved per-page statistics back from disk and display them.
df = pd.read_csv(filepath_or_buffer='nlp_stats.csv')
df

Unnamed: 0,page,total_words,unique_words,top_5_words,average_word_length,total_sentences
0,1,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
1,2,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
2,3,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
3,4,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
4,5,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
...,...,...,...,...,...,...
95,96,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
96,97,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
97,98,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1
98,99,823,202,"[('of', 52), ('&', 42), ('|', 38), ('Vitamin',...",4.900365,1


In [10]:
# Calculate averages of the statistics
average_stats = {
    'average_total_words': df_results['total_words'].mean(),
    'average_unique_words': df_results['unique_words'].mean(),
    'average_word_length': df_results['average_word_length'].mean(),
    'average_sentences': df_results['total_sentences'].mean()
}

# Convert the averages to a DataFrame
df_average_results = pd.DataFrame([average_stats])

# Save the averages to a CSV file
df_average_results.to_csv('aggregated_results.csv', index=False)


In [11]:
# Load the saved aggregate row back from disk and display it.
df2 = pd.read_csv(filepath_or_buffer='aggregated_results.csv')
df2

Unnamed: 0,average_total_words,average_unique_words,average_word_length,average_sentences
0,823.0,202.0,4.900365,1.0
