In [1]:
import os
import re
import urllib.parse

import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer, util

In [10]:
# Sentence-embedding model shared by every similarity check in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Google Custom Search credentials.
# They are read from the environment (GOOGLE_API_KEY / GOOGLE_CSE_ID) so that a
# real key never has to be saved inside the notebook; the literals are only
# fallbacks that keep the previous behaviour when the variables are unset.
API_KEY = os.environ.get("GOOGLE_API_KEY", "your_api_key")
SEARCH_ENGINE_ID = os.environ.get("GOOGLE_CSE_ID", "76c6f87422ff1434d")

In [11]:
def clean_latex_equations(text: str) -> str:
    """Convert LaTeX expressions to a simplified, readable form for PCMB content.

    The text is passed through an ordered list of regex rewrites. Order matters:
    a rule that needs the original braces (derivative, integral, sum,
    scientific notation) must run before the generic rule that would consume
    or unbrace the same text.

    Args:
        text: Question text that may contain inline LaTeX (``$...$``).

    Returns:
        The text with dollar signs removed, common constructs spelled out
        (fractions, roots, integrals, sums, limits, absolute values, arrows),
        every remaining ``\\command`` dropped, and whitespace collapsed to
        single spaces.
    """
    # A braced argument that may itself contain one level of braces, so that
    # \frac{M_{2}}{M_{1}} or \sqrt{x^{2}+1} are captured whole.
    arg = r'\{((?:[^{}]|\{[^{}]*\})*)\}'

    # 1. Remove dollar signs
    text = text.replace('$', '')

    # 2. Chemistry & Physics formatting
    text = re.sub(r'\\mathrm\{([A-Za-z0-9]+)\}_\{(\d+)\}', r'\1_\2', text)  # \mathrm{M}_{1} -> M_1
    text = re.sub(r'\\mathrm\{([A-Za-z0-9]+)\}', r'\1', text)               # \mathrm{O} -> O

    # 3. Fractions and roots
    # The derivative operator is a \frac too, so it has to be handled before
    # the generic fraction rule turns it into "(d/dx)".
    text = re.sub(r'\\frac\s*\{d\}\{dx\}', 'd/dx', text)
    text = re.sub(r'\\frac\s*' + arg + arg, r'(\1/\2)', text)               # \frac{a}{b} -> (a/b)
    text = re.sub(r'\\sqrt\s*\[(.*?)\]\s*' + arg, r'\1th root of (\2)', text)  # nth root
    text = re.sub(r'\\sqrt' + arg, r'sqrt(\1)', text)                       # \sqrt{a} -> sqrt(a)

    # 4. Constructs that rely on braced sub/superscripts; these must run
    #    before step 5 strips the braces they match on.
    text = re.sub(r'\\int\s*_' + arg + r'\^' + arg, r'integral from \1 to \2', text)
    text = re.sub(r'\\sum\s*_' + arg + r'\^' + arg, r'sum from \1 to \2', text)
    text = re.sub(r'\\times\s*10\^\{(-?\d+)\}', r'×10^\1', text)            # scientific notation

    # 5. Subscripts and superscripts
    # Digit subscripts keep their underscore (H_{2}O -> H_2O, M_{1} -> M_1).
    text = re.sub(r'_\{(\d+)\}', r'_\1', text)                              # _{2} -> _2
    text = re.sub(r'\^\{(.*?)\}', r'^\1', text)                             # ^{a+b} -> ^a+b

    # 6. Limits (after step 5 so a bound such as 0^{+} is already unbraced)
    text = re.sub(r'\\lim\s*_\{(.*?)\}', r'limit as \1 of', text)

    # 7. Trigonometry and functions
    text = re.sub(r'\\(sin|cos|tan|cot|sec|csc|log|ln)\b', r'\1', text)

    # 8. Absolute value
    text = re.sub(r'\\left\|(.+?)\\right\|', r'absolute value of (\1)', text)
    text = re.sub(r'\|(.+?)\|', r'absolute value of (\1)', text)

    # 9. Chemistry-specific notation
    text = re.sub(r'\\text\s*\{(aq|s|l|g)\}', r'(\1)', text)                # \text{g} -> (g)
    text = re.sub(r'\\rightleftharpoons', '⇌', text)
    # The lookahead stops "\to" from eating the front of a longer command.
    text = re.sub(r'\\(?:rightarrow|to)(?![A-Za-z])', '→', text)

    # 10. Physics-specific notation
    text = re.sub(r'\\vec\s*\{([a-zA-Z])\}', r'\1 (vector)', text)
    text = re.sub(r'\\degree|\^\\circ', '°', text)

    # 11. General LaTeX cleanup
    text = re.sub(r'\\[,;: ]', ' ', text)    # spacing macros (\, \; \: "\ ") become a space
    text = re.sub(r'\\!', '', text)          # negative thin space
    # Whole-command removal; covers \left, \right, \cdot, \times, ... without
    # leaving fragments such as "arrow" from \leftarrow.
    text = re.sub(r'\\[a-zA-Z]+', '', text)

    # 12. Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [12]:
def clean_question(text: str) -> str:
    """Return the question with LaTeX simplified, whitespace collapsed and lowercased.

    The LaTeX-cleaned intermediate text is printed before the final
    normalisation so it can be inspected in the cell output.
    """
    latex_free = clean_latex_equations(text)
    print(f"\n cleaned_latex: {latex_free}")
    tokens = latex_free.strip().split()
    single_spaced = " ".join(tokens)
    return single_spaced.lower()

In [13]:
def get_search_results(query: str, top_n: int = 100, timeout: float = 10.0):
    """Query the Google Custom Search API and return (title, snippet, link) tuples.

    Args:
        query: Search text. A blank query returns ``[]`` without calling the API.
        top_n: Maximum number of items kept from the response. Only a single
            request is made, so the result is also bounded by however many
            items that one response contains.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang a whole batch run.

    Returns:
        A list of ``(title, snippet, link)`` tuples; missing fields become
        ``""``. An empty list is returned on any request or decoding failure
        (the error is printed, not raised).
    """
    if not query or not query.strip():
        return []

    # Let requests do the URL encoding instead of assembling the query string by hand.
    params = {"q": query, "key": API_KEY, "cx": SEARCH_ENGINE_ID}

    try:
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()
        items = response.json().get("items", [])[:top_n]
    except (requests.exceptions.RequestException, ValueError) as e:
        # The exception text can include the full request URL, which carries
        # the API key; mask it before it lands in the notebook output.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, "***")
        print(f"[ERROR] Google API request failed: {message}")
        return []

    return [(item.get("title", ""), item.get("snippet", ""), item.get("link", "")) for item in items]

In [14]:
def check_similarity(original_question: str, candidates: list, threshold: float = 0.8):
    """Compute semantic similarity between the question and candidate search hits.

    Each candidate is a ``(title, snippet, link)`` tuple; title and snippet are
    joined as ``"<title>. <snippet>"`` and compared to the question by cosine
    similarity of their embeddings.

    Args:
        original_question: The (already cleaned) question text.
        candidates: List of ``(title, snippet, link)`` tuples.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        ``(True, similarity, link)`` for the first candidate, in list order,
        whose similarity reaches ``threshold``. Otherwise
        ``(False, best_score, best_link)`` for the highest-scoring candidate
        (``0.0`` / ``None`` when no candidate scores above zero or the list
        is empty).
    """
    if not candidates:
        return False, 0.0, None

    candidate_texts = [f"{title}. {snippet}".strip() for title, snippet, _ in candidates]

    # Encode all candidates in one batched call rather than one model call each.
    question_embedding = model.encode(original_question, convert_to_tensor=True)
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)

    # cos_sim yields a 1 x N matrix here; row 0 holds one score per candidate.
    scores = util.cos_sim(question_embedding, candidate_embeddings)[0].tolist()

    best_score = 0.0
    best_link = None

    for (_, _, link), similarity in zip(candidates, scores):
        if similarity >= threshold:
            return True, similarity, link
        if similarity > best_score:
            best_score = similarity
            best_link = link

    return False, best_score, best_link

In [19]:
def check_question_existence(question: str, similarity_threshold: float = 0.80):
    """Decide whether a question already appears online.

    The question is cleaned, sent to the search API, and the returned hits are
    scored against it.

    Returns:
        ``("Yes", score)`` when a hit reaches ``similarity_threshold``,
        otherwise ``("No", score)``; the score is rounded to three decimals,
        and is ``0.0`` when the search returned nothing.
    """
    normalized = clean_question(question)
    hits = get_search_results(normalized)

    # Nothing to compare against: report and bail out early.
    if not hits:
        print(" No search results found.")
        return "No", 0.0

    is_match, score, link = check_similarity(normalized, hits, threshold=similarity_threshold)
    rounded_score = round(score, 3)

    if is_match:
        print(f" Question Found Online!\n {link}\n Similarity Score: {score:.2f}")
        return "Yes", rounded_score

    print(f" Not Found (Max Similarity = {score:.2f} < {similarity_threshold})")
    if link:
        print(f" Closest Match: {link}")
    return "No", rounded_score

In [20]:
def process_csv(input_csv: str, question_column: str = "question", output_csv: str = "checked_questions.csv"):
    """Check every question in a CSV file and write the verdicts to a new CSV.

    Args:
        input_csv: Path of the CSV to read.
        question_column: Name of the column holding the question text.
        output_csv: Path the annotated CSV is written to (without the index).

    Raises:
        ValueError: If ``question_column`` is not a column of the input file.

    The output keeps all input columns and adds ``Found_Online`` and
    ``Similarity_Score``. Rows whose question is not a non-blank string get
    ``"Invalid"`` and ``0.0`` without being looked up.
    """
    frame = pd.read_csv(input_csv)
    if question_column not in frame.columns:
        raise ValueError(f"Column '{question_column}' not found in CSV.")

    # One (found, score) pair per row, in row order.
    verdicts = []
    for position, raw_question in enumerate(frame[question_column], start=1):
        is_usable = isinstance(raw_question, str) and bool(raw_question.strip())
        if is_usable:
            print(f"\n[{position}] Checking Question:\n{raw_question[:100]}...")
            verdicts.append(check_question_existence(raw_question))
        else:
            verdicts.append(("Invalid", 0.0))

    frame["Found_Online"] = [found for found, _ in verdicts]
    frame["Similarity_Score"] = [score for _, score in verdicts]

    frame.to_csv(output_csv, index=False)
    print(f"\n Results written to: {output_csv}")

In [21]:
# Entry point: run the full check over the input CSV and write the annotated copy.
if __name__ == "__main__":
    process_csv(
        "input_questions_1.csv",
        question_column="question",
        output_csv="checked_questions.csv",
    )


[1] Checking Question:
A block of mass m = 2 kg is given an initial speed vi = 5 m/s and slides down a frictional incline t...

 cleaned_latex: A block of mass m = 2 kg is given an initial speed vi = 5 m/s and slides down a frictional incline that makes an angle θ = 30 degrees with the horizontal. The coefficient of kinetic friction between the block and the incline is μk = 0.2. The incline is surrounded by a large cylindrical surface of radius R = 5 m. After the block slides down the incline, it will collide with the cylindrical surface at the point of intersection and move along the inner surface of the cylinder. The cylinder is fixed and does not rotate. Given that the block slides along the cylindrical surface without slipping, find the linear velocity of the block immediately after it changes direction from sliding down the incline to moving along the cylindrical surface.(Assume Earth's gravitational acceleration g = 9.81 m/s².)
 Not Found (Max Similarity = 0.68 < 0.8)
 Closest M

In [None]:
# if __name__ == "__main__":
#     que =[
#         r"The density of a solid ball is to be determined in an experiment. The diameter of the ball is measured with a screw gauge, whose pitch is 0.5 mm and there are 50 divisions on the circular scale. The reading on the main scale is 2.5 mm and that on the circular scale is 20 divisions. If the measured mass of the ball has a relative error of 2%, the relative percentage error in the density is:",
#         r"Let $C_1$ be the circle defined by the equation $x^2 + y^2 = 3$, with center at the origin $O$. This circle intersects the parabola $x^2 = 2y$ at a point $P$ located in the first quadrant. Let the tangent line to circle $C_1$ at point $P$ intersect circles $C_2$ and $C_3$ at points $R_2$ and $R_3$, respectively. Suppose that circles $C_2$ and $C_3$ have equal radii, $r = 2\sqrt{3}$, and centers $Q_2$ and $Q_3$, respectively. Furthermore, assume that the centers $Q_2$ and $Q_3$ lie on the $y$-axis.",
#         r"Consider a concave mirror and a convex lens (refractive index $=1.5$ ) of focal length 10 cm each, separated by a distance of 50 cm in air r(refractive index $=1$ ) as shown in the figure. An object is placed at a distance of 15 cm from the mirror. Its erect image formed by this combination has magnification $\\mathrm{M}_{1}$. When the set- up is kept in a medium of refractive index 7/6, the magnification becomes $\\mathrm{M}_{2}$. The magnitude $\\left|\\frac{M_{2}}{M_{1}}\\right|$ is",
#         r"A concave mirror and a convex lens, each with a focal length of $f = 10\,\mathrm{cm}$, are separated by a distance $d = 50\,\mathrm{cm}$. The convex lens has a refractive index of $n_l = 1.5$. Initially, the system is immersed in air, which has a refractive index of $n_a = 1$. An object is positioned at a distance $u = 15\,\mathrm{cm}$ from the concave mirror. The resulting erect image formed by this optical system has a magnification denoted by $M_1$. Subsequently, the entire setup is immersed in a medium with a refractive index of $n_m = \frac{7}{6}$. In this new medium, the magnification of the erect image becomes $M_2$. Determine the magnitude of the ratio $\left| \frac{M_2}{M_1} \right|$.",
#         r"Consider a thermodynamic process where a substance transitions between two states, denoted as $\alpha$ and $\beta$. Given that the entropy difference, $\Delta S = S_{\beta} - S_{\alpha}$, is zero at absolute zero, $T = 0\,\mathrm{K}$, and that $\ln 2 \approx 0.69$, determine the value of $\Delta S$ (in $\mathrm{J\,mol^{-1}\,K^{-1}}$) when the temperature is $T = 300\,\mathrm{K}$."
#     ]
#     for q in que:
#         check_question_existence(q)
#         print('\n')