In [1]:
import fasttext
import re
import time  # Import time module for adding a delay
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException

# Load your FastText model
model = fasttext.load_model('fasttext_k4.bin')  # 3 # 3
#model = fasttext.load_model('fasttext_model_fold3.bin') # 1 #2
#model = fasttext.load_model('model_dark_pattern1.bin') #2 #1
# List of words to filter out
filter_words = set([
    "english", "hindi", "bengali", "kannada", "german", "marathi", "language"
])

# Preprocess function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\'₹$%]', ' ', text)
    keep_patterns = [
        r'\b\d+%?\b', r'₹\d+', r'\$\d+', 
        r'\d+ (?:star rating|reviews?|trusted by|bought by|in the last \d+ (?:hours|days))', 
        r'\d+% off', r'upto ₹\d+', r'upto \$\d+'
    ]
    remove_patterns = [
        r'\b\d+ ?(gb|mah|cm|inch|inches|mp|gen|year|years|months|month|ram|rom|display|battery|processor|platform)\b'
    ]
    for pattern in remove_patterns:
        text = re.sub(pattern, ' ', text)
    
    def replace_numbers(match):
        if any(re.search(pattern, match.group(0)) for pattern in keep_patterns):
            return match.group(0)
        else:
            return " "
    
    text = re.sub(r'\b\d+\b', replace_numbers, text)
    text = re.sub(' +', ' ', text)
    text = ''.join([char for char in text if ord(char) < 128])
    text = text.strip()

    if any(word in text for word in filter_words):
        return ""
    
    return text

# Set up Chrome options to run in headless mode
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize Selenium WebDriver with headless mode
browser = webdriver.Chrome(options=chrome_options)

# URL of the e-commerce website to scrape
url = "https://www.amazon.in/"
#url = "https://www.amazon.in/s?k=bedsheets"
browser.get(url)

# Function to segment the DOM elements
def segmented(element):
    elements = element.find_elements(By.XPATH, ".//*")
    return elements

# Function to scrape the website, classify text, and highlight matched elements
def scrape_and_highlight():
    elements = segmented(browser.find_element(By.TAG_NAME, "body"))
    filtered_elements = []

    for el in elements:
        try:
            text = el.text.strip().replace("\t", " ")   
            if text:
                filtered_elements.append((el, text))
        except StaleElementReferenceException:
            continue
    
    for i, (el, text) in enumerate(filtered_elements):
        lines = text.split('\n')
        for line in lines:
            try:
                processed_text = preprocess(line).replace('\n', ' ')
                category = ''  # Initialize category here
                if processed_text:
                    label, score = model.predict(processed_text)
                    if label[0].split('__label__')[1] == 'yes':  # Check if the predicted label is 'yes'
                        category = 'dark pattern'
                        browser.execute_script("arguments[0].innerHTML = arguments[0].innerHTML.replace(arguments[1], '<span style=\"background-color: red;\">' + arguments[1] + '</span>');", el, line)
                    print(f"Element {i + 1}:\nOriginal Text: {line}\nPreprocessed Text: {processed_text}\nCategory: {category}\n")
            except StaleElementReferenceException:
                continue

# Start scraping and highlighting
scrape_and_highlight()

# Note: Do not close the browser to keep the highlighted elements visible
# browser.quit()


The chromedriver version (129.0.6668.58) detected in PATH at d:\Minor_Project\Models\chromedriver.exe might not be compatible with the detected chrome version (131.0.6778.86); currently, chromedriver 131.0.6778.85 is recommended for chrome 131.*, so it is advised to delete the driver in PATH and retry


SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 129
Current browser version is 131.0.6778.86 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe
Stacktrace:
	GetHandleVerifier [0x00007FF675B3FDA5+29557]
	(No symbol) [0x00007FF675AB2240]
	(No symbol) [0x00007FF67596B6EA]
	(No symbol) [0x00007FF6759ADF67]
	(No symbol) [0x00007FF6759ACFFB]
	(No symbol) [0x00007FF6759A7B1E]
	(No symbol) [0x00007FF6759A268E]
	(No symbol) [0x00007FF6759F32E0]
	(No symbol) [0x00007FF6759F2970]
	(No symbol) [0x00007FF6759E72A3]
	(No symbol) [0x00007FF6759B12DF]
	(No symbol) [0x00007FF6759B2451]
	GetHandleVerifier [0x00007FF675E6DCBD+3363469]
	GetHandleVerifier [0x00007FF675EB9B47+3674391]
	GetHandleVerifier [0x00007FF675EAEAEB+3629243]
	GetHandleVerifier [0x00007FF675BFFC66+815670]
	(No symbol) [0x00007FF675ABD6EF]
	(No symbol) [0x00007FF675AB92B4]
	(No symbol) [0x00007FF675AB9450]
	(No symbol) [0x00007FF675AA81FF]
	BaseThreadInitThunk [0x00007FFA69A1259D+29]
	RtlUserThreadStart [0x00007FFA6BD4AF38+40]


In [2]:
import fasttext

model = fasttext.load_model('fasttext_k4.bin')





In [3]:
# Example: Extracting word vectors
word_vectors = {word: model.get_word_vector(word) for word in model.get_words()}


In [4]:
import joblib

joblib.dump(word_vectors, "model.pkl")


['model.pkl']

In [8]:
import fasttext

# Load the FastText model from the .bin file
model1 = fasttext.load_model('fasttext_k4.bin')

# Save the FastText model to a .bin file (or .ftz for compressed models)
model1.save_model('model_saved.ftz')  # You can use any name here




In [1]:
import fasttext

# Load the saved FastText model
model = fasttext.load_model('model_saved.ftz')

# Test the model with example tokens
tokens = ['free', 'limited', 'guarantee', 'terms', 'privacy', 'sale', 'exclusive']

# Predict the classification for each token (word)
output = []

for token in tokens:
    prediction = model.predict(token)  # Predict the class
    output.append({'word': token, 'predicted_label': prediction[0][0], 'confidence': prediction[1][0]})

# Print the results
for result in output:
    print(f"Word: {result['word']}, Predicted Label: {result['predicted_label']}, Confidence: {result['confidence']}")


Word: free, Predicted Label: __label__no, Confidence: 0.9985085725784302
Word: limited, Predicted Label: __label__no, Confidence: 0.9996528625488281
Word: guarantee, Predicted Label: __label__no, Confidence: 1.0000087022781372
Word: terms, Predicted Label: __label__no, Confidence: 1.0000098943710327
Word: privacy, Predicted Label: __label__no, Confidence: 1.0000077486038208
Word: sale, Predicted Label: __label__no, Confidence: 1.0000100135803223
Word: exclusive, Predicted Label: __label__yes, Confidence: 0.9889583587646484




In [10]:
import fasttext
import pickle

# Load the FastText model
model = fasttext.load_model('model_saved.ftz')

# Create a dictionary of word embeddings
word_vectors = {word: model.get_word_vector(word) for word in model.get_words()}

# Save the word vectors as a pickle file
with open('word_vectors.pkl', 'wb') as f:
    pickle.dump(word_vectors, f)

print("Word vectors saved to word_vectors.pkl")




Word vectors saved to word_vectors.pkl


In [11]:
import pickle
import numpy as np

# Load the word vectors from the pickle file
with open('word_vectors.pkl', 'rb') as f:
    word_vectors = pickle.load(f)

# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Example words to test
words = ["free", "exclusive", "guarantee", "sale", "privacy", "limited"]

# Check if the words are in the word_vectors dictionary and display their vectors
for word in words:
    if word in word_vectors:
        print(f"Word: {word}")
        print(f"Word Vector: {word_vectors[word]}\n")
    else:
        print(f"Word '{word}' not found in word vectors\n")

# Test similarity between two words (for example, "free" and "exclusive")
word1 = "free"
word2 = "exclusive"

if word1 in word_vectors and word2 in word_vectors:
    similarity = cosine_similarity(word_vectors[word1], word_vectors[word2])
    print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")
else:
    print(f"One or both words '{word1}' and '{word2}' not found in word vectors")


Word: free
Word Vector: [ 0.04877536 -0.02375953  0.03516376  0.01735046 -0.02450103  0.00037739
 -0.00280879  0.00366569 -0.05549864 -0.03751379  0.01586474 -0.03196466
 -0.03759388 -0.01514249  0.0253796   0.01736866 -0.01506189 -0.02573508
 -0.0099211   0.03536506  0.02191356 -0.01323203 -0.03631254  0.01061219
 -0.01699558  0.01204267  0.00584632  0.01537866  0.00081454 -0.03826044
  0.02379689  0.02068958  0.02678249  0.04294058  0.00491804 -0.00264826
 -0.02845364  0.03067598 -0.01239639  0.01501953 -0.01859472 -0.01072602
  0.00894872  0.01880982  0.00783888 -0.00730331  0.01213707  0.02213049
  0.00911718 -0.02457174]

Word: exclusive
Word Vector: [ 0.23515052 -0.21728584  0.13773201  0.05461797 -0.15660585  0.0341207
 -0.13063277  0.0158461  -0.27612948 -0.120155    0.01838774 -0.11512704
 -0.15364741 -0.07784876  0.10026268  0.00188856 -0.04848239 -0.16804135
  0.04946726  0.19939718  0.02923743 -0.20302361 -0.10049158  0.2030936
 -0.15904428  0.08179528 -0.0473648   0.020129

In [12]:
import pickle
from sklearn.externals import joblib
import numpy as np

# Load word embeddings and classifier
with open('word_vectors.pkl', 'rb') as f:
    word_vectors = pickle.load(f)

category_classifier = joblib.load('category_classifier.joblib')  # Assuming it's a scikit-learn classifier

# Function to get sentence embedding
def sentence_embedding(sentence):
    words = sentence.lower().split()
    word_vecs = []

    for word in words:
        if word in word_vectors:
            word_vecs.append(word_vectors[word])
    
    if not word_vecs:
        return None
    return np.mean(word_vecs, axis=0)

# Example sentence
sentence = "The swimming competition was exciting."
embedding = sentence_embedding(sentence)

if embedding is not None:
    # Use classifier to predict category for the sentence
    prediction = category_classifier.predict([embedding])
    print(f"Predicted Category: {prediction[0]}")
else:
    print("No word embeddings found for the sentence.")


ImportError: cannot import name 'joblib' from 'sklearn.externals' (c:\Users\DARREN CHAHAL\.conda\envs\myenv\lib\site-packages\sklearn\externals\__init__.py)