<a href="https://colab.research.google.com/github/AnKiTu03/NLP/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage
import json

def extract_elements(url):
    """Download *url* and collect the page elements used for validation.

    Args:
        url: Address of the page to fetch (the request times out after 5 s).

    Returns:
        A dict with the keys ``title``, ``text_content``,
        ``forms_and_actions``, ``links``, ``meta_info`` and ``scripts``,
        or ``None`` when the page could not be fetched or parsed (the error
        is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=5)
        # Treat 4xx/5xx answers as failures so that an error page is never
        # analysed as if it were the site's real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        forms_and_actions = [
            {"formHTML": str(form), "actionURL": form.get("action")}
            for form in soup.find_all('form')
        ]
        links = [a.get('href') for a in soup.find_all('a', href=True)]
        scripts = [str(script) for script in soup.find_all('script')]
        # Only <meta> tags that carry a name attribute are reported.
        meta_info = [
            "{}={}".format(m.get('name'), m.get('content'))
            for m in soup.find_all('meta')
            if m.get('name')
        ]
        # ``.string`` is None for an empty <title> or one with nested tags;
        # fall back to '' so the title is always a string.
        title = (soup.title.string or '') if soup.title else ''
        text_content = soup.get_text()

        return {
            "title": title,
            "text_content": text_content,
            "forms_and_actions": forms_and_actions,
            "links": links,
            "meta_info": meta_info,
            "scripts": scripts
        }
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None, which the caller checks for.
        print(f"An error occurred: {e}")
        return None

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # The opening fence may carry a language tag such as ```json.
        if text.lower().startswith("json"):
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def llm_content_check(url):
    """Ask the Gemini model whether the website at *url* looks legitimate.

    The page is fetched with ``extract_elements``, embedded in a prompt and
    sent to the model. The raw reply is printed, then parsed as JSON.

    Args:
        url: Address of the website to evaluate.

    Returns:
        The model's verdict re-serialised as a JSON string indented by four
        spaces, or ``None`` when the website content could not be extracted.

    Raises:
        ValueError: If the ``GOOGLE_API_KEY`` environment variable is unset
            or empty.
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Read the key from the environment; credentials must never be
    # committed to source control.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable is not set.")

    website_content = extract_elements(url)
    if not website_content:
        print("Failed to extract website content.")
        return None

    llm = ChatGoogleGenerativeAI(model="gemini-pro", api_key=api_key)

    # Each section ends with an explicit newline so that adjacent string
    # literals do not run together when Python concatenates them.
    prompt = (
        "You are acting as a Website Validator. Your task is to analyze the content of the provided website and determine whether it is legitimate or a scam.\n\n"
        "Instructions:\n"
        "1. Carefully review the website content provided below, including its title, text content, links, forms, meta information, and scripts.\n"
        "2. Evaluate the legitimacy of the website based on these factors:\n"
        "   - Professional appearance and structure.\n"
        "   - Informative and relevant content.\n\n"
        "Provide the output strictly in JSON format with the following structure:\n"
        "{\n"
        "    Result: <Scam or Legitimate>,\n"
        "    Reasons: [\n"
        "        Reason 1,\n"
        "        Reason 2,\n"
        "        Reason 3,\n"
        "        Reason 4,\n"
        "        Reason 5\n"
        "      ],\n"
        "    Conclusion: <A one-liner conclusion summarizing your evaluation>\n"
        "}\n\n"
        f"Website Content:\n{website_content}"
    )
    message = HumanMessage(content=prompt)

    response = llm.invoke([message])
    response = response.content
    print(response)

    # The model may wrap its answer in a ```json fence, possibly followed
    # by trailing whitespace; remove it before parsing.
    parsed = json.loads(_strip_code_fence(response))
    return json.dumps(parsed, indent=4)


In [None]:
# Run the validator against a sample site. The call prints the model's raw
# reply and returns the verdict as an indented JSON string.
website = "https://www.msrit.edu"
llm_output = llm_content_check(website)


```json
{
  "Result": "Legitimate",
  "Reasons": [
    "Professional website design and structure.",
    "Informative and relevant content related to the institution.",
    "Presence of contact information and social media links.",
    "Valid meta information and scripts.",
    "No red flags or suspicious elements found."
  ],
  "Conclusion": "Based on the provided content, the website appears legitimate and represents Ramaiah Institute of Technology."
}
```


In [None]:
# Display the value returned by llm_content_check in the previous cell.
print(llm_output)

{
    "Result": "Legitimate",
    "Reasons": [
        "Professional website design and structure.",
        "Informative and relevant content related to the institution.",
        "Presence of contact information and social media links.",
        "Valid meta information and scripts.",
        "No red flags or suspicious elements found."
    ],
    "Conclusion": "Based on the provided content, the website appears legitimate and represents Ramaiah Institute of Technology."
}
