<a href="https://colab.research.google.com/github/Aayush050502/Transformer-Powered-Sales-Chatbots/blob/main/PRODUCT_EVENTS_%26_CHATBOT_TRACKER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PART 1: DATA SCRAPING


IMPORTING WEB SCRAPING LIBRARIES



In [None]:
!pip install requests beautifulsoup4 pandas



IMPORT GOOGLE DRIVE


In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime
import os
import time
from urllib.parse import urljoin
import re

EXTRACTING IMPORTANT INFORMATION ABOUT PRODUCTS FROM THE WEBSITE

In [None]:
class DrRashelScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get_soup(self, url):
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_product_links(self, soup):
        """Extract all product page links"""
        product_links = []
        product_containers = soup.find_all('a', href=True)
        for link in product_containers:
            href = link['href']
            if 'product' in href.lower():
                full_url = urljoin(self.base_url, href)
                if full_url not in product_links:
                    product_links.append(full_url)
        return product_links

    def extract_price(self, text):
        """Extract price value from text"""
        if text:
            price_match = re.search(r'[\d,]+\.?\d*', text.replace(',', ''))
            if price_match:
                return float(price_match.group())
        return None

    def extract_product_details(self, url):
        """Extract detailed product information"""
        soup = self.get_soup(url)
        if not soup:
            return None

        product = {
            'url': url,
            'timestamp': datetime.now().isoformat()
        }


        name_elem = soup.find(['h1', 'h2'], class_=['product-title', 'product-name'])
        if name_elem:
            product['name'] = name_elem.text.strip()

        price_elem = soup.find(['span', 'div'], class_=['price', 'product-price'])
        if price_elem:
            product['price'] = self.extract_price(price_elem.text)
            product['price_raw'] = price_elem.text.strip()


        sku_elem = soup.find(['span', 'div'], class_=['sku', 'product-sku'])
        if sku_elem:
            product['sku'] = sku_elem.text.strip()


        desc_elem = soup.find(['div', 'p'], class_=['description', 'product-description'])
        if desc_elem:
            product['description'] = desc_elem.text.strip()


        category_elem = soup.find('nav', class_=['breadcrumb', 'woocommerce-breadcrumb'])
        if category_elem:
            product['categories'] = [cat.text.strip() for cat in category_elem.find_all('a')]


        image_elems = soup.find_all('img', class_=['product-image', 'wp-post-image'])
        product['images'] = [urljoin(self.base_url, img['src']) for img in image_elems if 'src' in img.attrs]


        specs = {}
        specs_table = soup.find('table', class_=['specifications', 'product-attributes'])
        if specs_table:
            for row in specs_table.find_all('tr'):
                cols = row.find_all(['th', 'td'])
                if len(cols) == 2:
                    specs[cols[0].text.strip()] = cols[1].text.strip()
        product['specifications'] = specs

        return product

    def scrape_website(self):
        """Main scraping function"""
        print("Starting website scrape...")


        soup = self.get_soup(self.base_url)
        if not soup:
            return None


        site_data = {
            'url': self.base_url,
            'scrape_time': datetime.now().isoformat(),
            'title': soup.title.string if soup.title else None,
            'meta_description': None,
            'contact_info': {},
            'social_links': {},
            'categories': [],
            'products': []
        }


        meta_desc = soup.find('meta', {'name': 'description'})
        if meta_desc:
            site_data['meta_description'] = meta_desc.get('content')


        contact_patterns = {
            'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            'phone': r'[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}'
        }

        for contact_type, pattern in contact_patterns.items():
            matches = re.findall(pattern, str(soup))
            if matches:
                site_data['contact_info'][contact_type] = matches[0]


        social_platforms = ['facebook', 'twitter', 'instagram', 'youtube', 'linkedin']
        for platform in social_platforms:
            social_links = soup.find_all('a', href=lambda x: x and platform in x.lower())
            if social_links:
                site_data['social_links'][platform] = social_links[0]['href']


        category_menu = soup.find_all(['nav', 'ul'], class_=['menu', 'categories'])
        for menu in category_menu:
            categories = menu.find_all('a')
            site_data['categories'].extend([cat.text.strip() for cat in categories])


        product_links = self.extract_product_links(soup)
        print(f"Found {len(product_links)} product links")


        for i, product_url in enumerate(product_links):
            print(f"Scraping product {i+1}/{len(product_links)}: {product_url}")
            product_data = self.extract_product_details(product_url)
            if product_data:
                site_data['products'].append(product_data)
            time.sleep(1)

        return site_data

SAVING THE DATA IN GOOGLE DRIVE AND IN JSON FORMAT

In [None]:
def save_to_drive(data, folder_name='ecommerce_data'):
    """Save scraped data to Google Drive"""

    drive_path = f'/content/drive/My Drive/{folder_name}'
    if not os.path.exists(drive_path):
        os.makedirs(drive_path)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')


    json_path = f'{drive_path}/site_data_{timestamp}.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


    if data['products']:
        df_products = pd.json_normalize(data['products'])
        csv_path = f'{drive_path}/products_{timestamp}.csv'
        df_products.to_csv(csv_path, index=False)

    t
    summary = {
        'scrape_time': data['scrape_time'],
        'total_products': len(data['products']),
        'categories': data['categories'],
        'contact_info': data['contact_info'],
        'social_links': data['social_links']
    }

    summary_path = f'{drive_path}/summary_{timestamp}.json'
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=4, ensure_ascii=False)

    print(f"\nData saved to Google Drive folder: {folder_name}")
    print(f"Files saved:")
    print(f"1. Complete data: site_data_{timestamp}.json")
    print(f"2. Products data: products_{timestamp}.csv")
    print(f"3. Summary: summary_{timestamp}.json")


MAIN EXECUTION

In [None]:

url = "https://dr.rashel.in/"
scraper = DrRashelScraper(url)
data = scraper.scrape_website()

if data:
    save_to_drive(data, 'dr_rashel_data')


    print("\n=== Scraping Summary ===")
    print(f"Total products found: {len(data['products'])}")
    print(f"Categories found: {len(data['categories'])}")
    print("Contact information found:", ', '.join(data['contact_info'].keys()))
    print("Social media links found:", ', '.join(data['social_links'].keys()))

Starting website scrape...
Found 33 product links
Scraping product 1/33: https://dr.rashel.in/products/neem-acne-pimple-patch
Scraping product 2/33: https://dr.rashel.in/collections/hyaluronic-acid-products
Scraping product 3/33: https://dr.rashel.in/collections/rice-water-products
Scraping product 4/33: https://dr.rashel.in/collections/coffee-products
Scraping product 5/33: https://dr.rashel.in/collections/charcoal-skincare-products
Scraping product 6/33: https://dr.rashel.in/collections/goat-milk-products-for-face-body
Scraping product 7/33: https://dr.rashel.in/collections/best-aloe-vera-products
Scraping product 8/33: https://dr.rashel.in/collections/ubtan-skincare-products
Scraping product 9/33: https://dr.rashel.in/collections/strawberry-skincare-products
Scraping product 10/33: https://dr.rashel.in/collections/mix-fruit-products-for-face-body
Scraping product 11/33: https://dr.rashel.in/collections/cucumber-products-for-face-body
Scraping product 12/33: https://dr.rashel.in/coll

PART 2- TRANSFORMERS IMPLEMENTATION FOR CHATBOT DEVELOPMENT AND EVALUATION


IMPORTING TRANSORMER LIBRARY

In [1]:
pip install pandas transformers



In [2]:
import pandas as pd
from datetime import datetime
from transformers import pipeline

 INITIALIZING LLM'S FOR CHATBOT SUGGESTION

In [3]:

chatbot_assistant = pipeline("text-generation", model="gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


SCRAPED URL'S

In [4]:

product_urls = [
    "https://dr.rashel.in/products/neem-acne-pimple-patch",
    "https://dr.rashel.in/collections/hyaluronic-acid-products",
    "https://dr.rashel.in/collections/rice-water-products",
    "https://dr.rashel.in/collections/coffee-products",
    "https://dr.rashel.in/collections/charcoal-skincare-products",
    "https://dr.rashel.in/collections/goat-milk-products-for-face-body",
    "https://dr.rashel.in/collections/best-aloe-vera-products",
    "https://dr.rashel.in/collections/ubtan-skincare-products",
    "https://dr.rashel.in/collections/strawberry-skincare-products",
    "https://dr.rashel.in/collections/mix-fruit-products-for-face-body",
    "https://dr.rashel.in/collections/cucumber-products-for-face-body",
    "https://dr.rashel.in/collections/coconut-products-for-face-body",
    "https://dr.rashel.in/collections/papaya-products-for-face-body",
    "https://dr.rashel.in/collections/onion-products",
    "https://dr.rashel.in/collections/avocado-products-for-face-body",
    "https://dr.rashel.in/collections/moringa-skincare-products",
    "https://dr.rashel.in/collections/buy-3-products-599",
    "https://dr.rashel.in/collections/best-sellers/products/de-tan-scrub-380-ml",
    "https://dr.rashel.in/collections/best-sellers/products/vitamin-c-face-serum-50-ml",
    "https://dr.rashel.in/collections/best-sellers/products/hyaluronic-acid-cream-380-ml",
    "https://dr.rashel.in/collections/best-sellers/products/bamboo-charcoal-nose-strips-for-blackheads-removal",
    "https://dr.rashel.in/collections/best-sellers/products/white-skin-cream-380-ml",
    "https://dr.rashel.in/collections/new-hydration-heros/products/aloe-vera-body-lotion",
    "https://dr.rashel.in/collections/new-hydration-heros/products/cocoa-butter-body-lotion",
    "https://dr.rashel.in/collections/new-hydration-heros/products/smooth-moisturizing-cream",
    "https://dr.rashel.in/collections/new-hydration-heros/products/vitamin-c-moisturizing-cream",
    "https://dr.rashel.in/collections/new-hydration-heros/products/hyaluronic-acid-body-lotion",
]

GENERATE PRODUCT DATA

In [5]:

product_data = pd.DataFrame(product_urls, columns=["url"])
product_data["product_id"] = [f"P{str(i+1).zfill(2)}" for i in range(len(product_urls))]
product_data["title"] = product_data["url"].apply(lambda x: x.split("/")[-1].replace("-", " ").title())

CREATING EVENT DATA

In [6]:

event_data = [
    {
        "event": "product_viewed",
        "time": "2025-01-14 09:00:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": product_urls[0],
        "title": "Neem Acne Pimple Patch"
    },
    {
        "event": "chatbot:open",
        "time": "2025-01-14 09:05:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": "https://dr.rashel.in/home",
        "title": "Homepage"
    },
    {
        "event": "message_sent",
        "time": "2025-01-14 09:06:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": "https://dr.rashel.in/home",
        "title": "Homepage",
        "message": "What products help with acne?"
    },
    {
        "event": "message_received",
        "time": "2025-01-14 09:06:10",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": "https://dr.rashel.in/home",
        "title": "Homepage",
        "message": "Our Neem Acne Pimple Patch is great for acne treatment."
    },
    {
        "event": "product_added_to_cart",
        "time": "2025-01-14 09:10:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": product_urls[0],
        "title": "Neem Acne Pimple Patch"
    },
    {
        "event": "checkout_started",
        "time": "2025-01-14 09:15:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": "https://dr.rashel.in/checkout",
        "title": "Checkout"
    },
    {
        "event": "checkout_completed",
        "time": "2025-01-14 09:20:00",
        "customer_id": "C001",
        "href": "utm_source=facebook",
        "url": "https://dr.rashel.in/checkout",
        "title": "Checkout"
    }
]


LOAD EVENT DATA INTO DATAFRAME

In [7]:

df_events = pd.DataFrame(event_data)

CONVERT TIME COLUMN TO DATETIME FOR ANALYSIS

In [8]:

df_events['time'] = pd.to_datetime(df_events['time'])


ANALYZE USER BEHAVIOUR

In [9]:

def analyze_user_behavior(dataframe):
    """
    Analyze user journey and interactions with the website.
    """
    customer_groups = dataframe.groupby("customer_id")
    for customer, events in customer_groups:
        print(f"\nCustomer ID: {customer}")
        events = events.sort_values(by="time")
        for _, event in events.iterrows():
            print(f"  {event['time']} - Event: {event['event']} - Page: {event['title']}")

ENHANCING CHATBOT WITH LLM SUGGESTIONS

In [10]:

def generate_chatbot_response(query):
    """
    Use LLM to generate chatbot responses based on user queries.
    """
    prompt = (
        f"The customer asked: '{query}'. "
        "Provide a helpful response that guides them to purchase products related to acne treatment."
    )
    response = chatbot_assistant(prompt, max_length=100, num_return_sequences=1)
    return response[0]["generated_text"]

In [11]:

if __name__ == "__main__":
    print("Analyzing User Behavior:")
    analyze_user_behavior(df_events)

    print("\nSimulating Chatbot Response:")
    user_query = "What products help with acne?"
    chatbot_reply = generate_chatbot_response(user_query)
    print(f"Chatbot Response: {chatbot_reply}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Analyzing User Behavior:

Customer ID: C001
  2025-01-14 09:00:00 - Event: product_viewed - Page: Neem Acne Pimple Patch
  2025-01-14 09:05:00 - Event: chatbot:open - Page: Homepage
  2025-01-14 09:06:00 - Event: message_sent - Page: Homepage
  2025-01-14 09:06:10 - Event: message_received - Page: Homepage
  2025-01-14 09:10:00 - Event: product_added_to_cart - Page: Neem Acne Pimple Patch
  2025-01-14 09:15:00 - Event: checkout_started - Page: Checkout
  2025-01-14 09:20:00 - Event: checkout_completed - Page: Checkout

Simulating Chatbot Response:
Chatbot Response: The customer asked: 'What products help with acne?'. Provide a helpful response that guides them to purchase products related to acne treatment.

The customer asked: 'How is acne so common? What can I do to help?'

The customer responded: 'Acne is all over with people who are trying to lose weight'.

In order to reduce the number and severity of acne, there is a series of products called "foods that can help with acne" which