In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data: the tokenizer models, the stopword lists and
# the WordNet corpus used by preprocess_text() below.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Normalize *text* for similarity comparison.

    The text is lower-cased and split with ``word_tokenize``; tokens that
    appear in NLTK's English stopword list are dropped and every remaining
    token is passed through ``WordNetLemmatizer.lemmatize``.

    Returns the surviving lemmas joined by single spaces.
    """
    words = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    lemmas = []
    for word in words:
        if word in english_stopwords:
            continue
        lemmas.append(lemmatize(word))
    return ' '.join(lemmas)

def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts.

    Both texts are run through ``preprocess_text`` and vectorized together
    with a fresh ``TfidfVectorizer``; the score is the cosine similarity
    between the two resulting rows.

    Returns 0.0 when either text is empty after preprocessing, or when the
    vectorizer cannot build a vocabulary from the two texts.
    """
    # Preprocess texts
    processed_text1 = preprocess_text(text1)
    processed_text2 = preprocess_text(text2)

    # An empty text has no terms in common with anything.
    if not processed_text1 or not processed_text2:
        return 0.0

    # Create TF-IDF vectors
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([processed_text1, processed_text2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no token
        # in either text survives the vectorizer's own tokenization, e.g.
        # when the input is punctuation only. Treat that as "no similarity"
        # instead of crashing the caller.
        return 0.0

    # Calculate cosine similarity between row 0 (text1) and row 1 (text2)
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def generate_response(user_input, reference_database):
    """Answer *user_input* with the closest passage in *reference_database*.

    Each entry is a mapping with a 'text' passage and a 'website' source.
    The entry whose text scores highest under ``calculate_similarity`` is
    selected; on ties the earliest entry wins. The passage is returned with
    its source only when the best score exceeds 0.1, otherwise a fixed
    "nothing found" message is returned.
    """
    best_score = 0
    best_entry = None
    for candidate in reference_database:
        score = calculate_similarity(user_input, candidate['text'])
        if score > best_score:
            best_score, best_entry = score, candidate

    # Guard clause: nothing scored above the relevance threshold.
    if best_score <= 0.1:
        return "No relevant information found on this topic."

    return (
        f"Based on your input, here's information from {best_entry['website']}:\n"
        + best_entry['text']
        + f"\n\nSource: {best_entry['website']}"
    )

# Sample reference database: (passage, source URL) pairs expanded into the
# {'text': ..., 'website': ...} records that generate_response() reads.
reference_database = [
    {'text': passage, 'website': source}
    for passage, source in (
        (
            "Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.",
            "https://www.sas.com/en_us/insights/analytics/machine-learning.html",
        ),
        (
            "Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development.",
            "https://www.python.org/doc/essays/blurb/",
        ),
        (
            "Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.",
            "https://en.wikipedia.org/wiki/Natural_language_processing",
        ),
    )
]

# Main program loop: keep answering queries until the user types 'quit'
# (matched case-insensitively).
while True:
    user_input = input("Enter your query (or 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break

    response = generate_response(user_input, reference_database)
    # A single print with newline separators emits the same text as printing
    # the heading, the response and the divider one after another.
    print("\nResponse:", response, "\n" + "-"*50 + "\n", sep="\n")

print("Thank you for using the text similarity detection system!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Response:
Based on your input, here's information from https://en.wikipedia.org/wiki/Natural_language_processing:
Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.

Source: https://en.wikipedia.org/wiki/Natural_language_processing

--------------------------------------------------


Response:
Based on your input, here's information from https://en.wikipedia.org/wiki/Natural_language_processing:
Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.

Source: https://en.wikipedia.org/wiki/Natural_language_processing

------------------

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz

# Download necessary NLTK data: the tokenizer models, the stopword lists and
# the WordNet corpus used by preprocess_text() below.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Normalize *text* for similarity comparison.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is not in NLTK's English stopword list and ``str.isalnum()``
    is true for it; kept tokens are lemmatized with ``WordNetLemmatizer``.

    Returns the lemmas joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    ignored = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def keep(token):
        # Stopword test first, then the alphanumeric test (short-circuit).
        return token not in ignored and token.isalnum()

    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if keep(token))

def calculate_similarity(text1, text2):
    """Score the similarity of two texts as the best of three measures.

    Both texts are first run through ``preprocess_text``. The result is the
    maximum of:

    * the cosine similarity of their TF-IDF vectors,
    * the Jaccard similarity of their token sets, and
    * ``fuzz.token_set_ratio`` scaled from 0-100 down to 0-1.

    Returns 0 when either text is empty after preprocessing.
    """
    # Preprocess texts
    processed_text1 = preprocess_text(text1)
    processed_text2 = preprocess_text(text2)

    # Handle empty strings before doing any vectorizer work
    if not processed_text1 or not processed_text2:
        return 0

    # Calculate cosine similarity over TF-IDF vectors
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([processed_text1, processed_text2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no token
        # in either text survives the vectorizer's own tokenization (for
        # example texts made only of single-character tokens). The other two
        # measures can still be computed, so only the cosine term is zeroed.
        cosine_sim = 0.0
    else:
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Calculate Jaccard similarity: |intersection| / |union| of the token sets
    set1 = set(processed_text1.split())
    set2 = set(processed_text2.split())
    union = set1 | set2
    jaccard_sim = len(set1 & set2) / len(union) if set1 and set2 else 0

    # Calculate fuzzy ratio
    # NOTE(review): token_set_ratio appears to score 100 whenever one token
    # set is contained in the other, so a one-word query found in a passage
    # yields 1.0 and dominates the max below - confirm this is intended.
    fuzzy_ratio = fuzz.token_set_ratio(processed_text1, processed_text2) / 100

    # Return the maximum of cosine, Jaccard, and fuzzy similarities
    return max(cosine_sim, jaccard_sim, fuzzy_ratio)

def generate_response(user_input, reference_database):
    """Answer *user_input* with the closest passage in *reference_database*.

    Each entry is a mapping with a 'text' passage and a 'website' source.
    Entries are scored with ``calculate_similarity``; the first entry with
    the highest score is the match. If that score exceeds 0.1 the passage,
    its source and the score are returned; otherwise a "nothing found"
    message carrying the best score is returned.
    """
    top_score, top_entry = 0, None
    for entry in reference_database:
        score = calculate_similarity(user_input, entry['text'])
        if score > top_score:
            top_score, top_entry = score, entry

    if top_score > 0.1:  # Lower threshold for better matching
        return "".join((
            f"Based on your input, here's information from {top_entry['website']}:\n",
            top_entry['text'],
            f"\n\nSource: {top_entry['website']}",
            f"\n(Similarity score: {top_score:.2f})",
        ))

    return f"No relevant information found on this topic. (Best match score: {top_score:.2f})"

# Expanded reference database: one record per passage, each holding the
# passage under 'text' and the page it was taken from under 'website'.
reference_database = [
    dict(
        text="Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.",
        website="https://www.sas.com/en_us/insights/analytics/machine-learning.html",
    ),
    dict(
        text="Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development.",
        website="https://www.python.org/doc/essays/blurb/",
    ),
    dict(
        text="Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.",
        website="https://en.wikipedia.org/wiki/Natural_language_processing",
    ),
    dict(
        text="Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computer systems to improve their performance on a specific task through experience.",
        website="https://www.ibm.com/cloud/learn/machine-learning",
    ),
    dict(
        text="Python programming language is widely used in data science, web development, automation, scientific computing, and artificial intelligence applications. It's known for its readability and simplicity.",
        website="https://www.python.org/about/",
    ),
    dict(
        text="NLP techniques are used in various applications including chatbots, sentiment analysis, language translation, text summarization, and speech recognition systems.",
        website="https://www.datarobot.com/blog/what-is-natural-language-processing-nlp/",
    ),
]

# Main program loop
def main():
    """Run the interactive console loop.

    Prints a short banner, then repeatedly reads a query, prints the answer
    produced by ``generate_response`` against ``reference_database``, and
    stops when the user enters 'quit' (matched case-insensitively).
    """
    banner = (
        "Text Similarity Detection System",
        "--------------------------------",
        "This system will find relevant information based on your query.",
        "Type 'quit' to exit the program.\n",
    )
    print(*banner, sep="\n")

    while True:
        user_input = input("Enter your query: ")
        if user_input.lower() == 'quit':
            break

        response = generate_response(user_input, reference_database)
        # Heading, answer and divider, each on its own line.
        print("\nResponse:", response, "\n" + "-"*50 + "\n", sep="\n")

    print("Thank you for using the text similarity detection system!")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text Similarity Detection System
--------------------------------
This system will find relevant information based on your query.
Type 'quit' to exit the program.


Response:
Based on your input, here's information from https://www.python.org/doc/essays/blurb/:
Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development.

Source: https://www.python.org/doc/essays/blurb/
(Similarity score: 1.00)

--------------------------------------------------


Response:
No relevant information found on this topic. (Best match score: 0.00)

--------------------------------------------------


Response:
No relevant information found on this topic. (Best match score: 0.00)

--------------------------------------------------

Thank you for using the text similarity detection system!


In [7]:
pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install selenium

Collecting seleniumNote: you may need to restart the kernel to use updated packages.

  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.4 MB 1.1 MB/s eta 0:00:09
   

In [17]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse

# Download necessary NLTK data: the tokenizer models, the stopword lists and
# the WordNet corpus used by preprocess_text() below.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """
    Normalize text for similarity comparison

    The text is lower-cased and tokenized with word_tokenize. Tokens that are
    English stopwords, or for which str.isalnum() is false, are discarded;
    the rest are lemmatized with WordNetLemmatizer and joined with spaces.
    """
    words = word_tokenize(text.lower())
    stop_set = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    kept = []
    for word in words:
        # Skip stopwords and anything that is not purely alphanumeric.
        if word in stop_set or not word.isalnum():
            continue
        kept.append(wnl.lemmatize(word))
    return ' '.join(kept)

def calculate_similarity(text1, text2):
    """
    Score the similarity of two texts as the best of three measures

    Both texts are first run through preprocess_text. The result is the
    maximum of the TF-IDF cosine similarity, the Jaccard similarity of the
    token sets, and fuzz.token_set_ratio scaled from 0-100 down to 0-1.

    Returns 0 when either text is empty after preprocessing.
    """
    # Preprocess texts
    processed_text1 = preprocess_text(text1)
    processed_text2 = preprocess_text(text2)

    # Handle empty strings before doing any vectorizer work
    if not processed_text1 or not processed_text2:
        return 0

    # Calculate cosine similarity over TF-IDF vectors
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([processed_text1, processed_text2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no token
        # in either text survives the vectorizer's own tokenization (for
        # example texts made only of single-character tokens). The other two
        # measures can still be computed, so only the cosine term is zeroed.
        cosine_sim = 0.0
    else:
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Calculate Jaccard similarity: |intersection| / |union| of the token sets
    set1 = set(processed_text1.split())
    set2 = set(processed_text2.split())
    union = set1 | set2
    jaccard_sim = len(set1 & set2) / len(union) if set1 and set2 else 0

    # Calculate fuzzy ratio
    # NOTE(review): token_set_ratio appears to score 100 whenever one token
    # set is contained in the other, so a short query found in a passage
    # yields 1.0 and dominates the max below - confirm this is intended.
    fuzzy_ratio = fuzz.token_set_ratio(processed_text1, processed_text2) / 100

    # Return the maximum of cosine, Jaccard, and fuzzy similarities
    return max(cosine_sim, jaccard_sim, fuzzy_ratio)

def search_web(query, num_results=5):
    """
    Search the web for information related to the query

    The query is inserted into a fixed list of URLs (a Wikipedia article URL
    and three site-search URLs). Each page that answers with HTTP 200 is
    parsed and its leading paragraphs are taken as the passage.

    Args:
        query: Free-text search query. Surrounding whitespace is ignored.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys 'text' (passage, at most 1000 characters),
        'website' (the URL fetched), 'title' and 'domain'. The list is empty
        when the query is blank, num_results is not positive, or no page
        produced more than 100 characters of paragraph text.
    """
    # Local import: only urlparse is imported at the top of the file.
    from urllib.parse import quote, quote_plus

    query = query.strip()
    if not query or num_results <= 0:
        # A blank query would only fetch unrelated landing pages.
        return []

    # For a real implementation, you might use a search API like Google Custom Search or Bing Search
    # Here we'll use a simple approach by directly scraping some educational websites

    # Percent-encode the query so characters such as '&', '#' or '?' cannot
    # change the structure of the URL. Spaces still become '_' (Wikipedia
    # article path) and '+' (query strings), as before.
    wiki_title = quote(query.replace(' ', '_'))
    search_terms = quote_plus(query)

    # Define a list of websites to search
    websites = [
        f"https://en.wikipedia.org/wiki/{wiki_title}",
        f"https://www.britannica.com/search?query={search_terms}",
        f"https://www.sciencedirect.com/search?qs={search_terms}",
        f"https://scholar.google.com/scholar?q={search_terms}"
    ]

    results = []

    # Define headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    for url in websites:
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                continue

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the title; .string is None when <title> is empty or
            # contains nested markup, so fall back in that case too.
            title_text = soup.title.string if soup.title else None
            title = (title_text or "").strip() or "No title found"

            # Extract main content (this varies by website)
            if "wikipedia.org" in url:
                # For Wikipedia: first paragraphs of the article body
                paragraphs = soup.select("div.mw-parser-output p")[:3]
            else:
                # For other websites: first paragraphs anywhere on the page
                paragraphs = soup.find_all('p')[:5]
            content = " ".join(p.get_text() for p in paragraphs)

            # Clean up the content (collapse runs of whitespace)
            content = re.sub(r'\s+', ' ', content).strip()

            # Add to results only if a meaningful amount of text was found
            if len(content) > 100:
                results.append({
                    'text': content[:1000],  # Limit content length
                    'website': url,
                    'title': title,
                    'domain': urlparse(url).netloc  # Domain for citation
                })

                # Stop once we have enough results
                if len(results) >= num_results:
                    break

        except Exception as e:
            # Best effort: a failure on one site must not abort the others.
            print(f"Error scraping {url}: {e}")
            continue

    return results

def extract_citation_info(url):
    """
    Extract citation information from a webpage

    Fetches the page and reads the <title> element plus author and
    publication-date <meta> tags.

    Returns:
        A dict with keys 'title', 'author', 'date', 'domain' and 'url'.
        'author' and 'date' are None when the page does not declare them.
        If the page cannot be fetched or parsed, the error is printed and a
        placeholder dict with title "Unable to retrieve title" is returned.
    """
    # Get the domain (needed by both the success and the fallback result)
    domain = urlparse(url).netloc

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures so that the title of an error
        # page is never presented as the citation title.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract basic citation information; .string is None when <title>
        # is empty or contains nested markup.
        title_text = soup.title.string if soup.title else None
        title = (title_text or "").strip() or "No title found"

        # Try to find author information
        author = None
        author_meta = soup.find('meta', {'name': ['author', 'Author']})
        if author_meta:
            author = author_meta.get('content')

        # Try to find publication date
        date = None
        date_meta = soup.find('meta', {'name': ['date', 'pubdate', 'publishdate', 'publication_date', 'article:published_time']})
        if date_meta is None:
            # Open Graph markup carries this value in a property= attribute
            # rather than name=, so look there as well.
            date_meta = soup.find('meta', {'property': 'article:published_time'})
        if date_meta:
            date = date_meta.get('content')

        return {
            'title': title,
            'author': author,
            'date': date,
            'domain': domain,
            'url': url
        }
    except Exception as e:
        # Best effort: the caller still gets a usable (if sparse) citation.
        print(f"Error extracting citation from {url}: {e}")
        return {
            'title': "Unable to retrieve title",
            'author': None,
            'date': None,
            'domain': domain,
            'url': url
        }

def format_citation(citation_info):
    """
    Format citation information in APA style

    Args:
        citation_info: Mapping that may contain 'title', 'author', 'date'
            and 'domain'. Missing keys and None/empty values are treated
            alike.

    Returns:
        "<author>. (<date>). <title>. Retrieved from <domain>" when an author
        is known, otherwise "<title>. (<date>). Retrieved from <domain>".
        A date longer than four characters is reduced to the first 19xx/20xx
        year found in it; the date is 'n.d.' when it is missing or no such
        year is found.
    """
    # Use `or` instead of a .get() default: extract_citation_info stores None
    # for fields it could not find, and a key that is present with value None
    # bypasses the .get() default (which used to render as "(None)").
    title = citation_info.get('title') or 'No title'
    author = citation_info.get('author') or 'No author'
    date = str(citation_info.get('date') or 'n.d.')
    domain = citation_info.get('domain') or ''

    # Format the date: keep only the year of longer date strings
    if len(date) > 4:
        year_match = re.search(r'20\d{2}|19\d{2}', date)
        date = year_match.group(0) if year_match else 'n.d.'

    # Format the citation in APA style
    if author != 'No author':
        citation = f"{author}. ({date}). {title}. Retrieved from {domain}"
    else:
        citation = f"{title}. ({date}). Retrieved from {domain}"

    return citation

def generate_web_response(user_input):
    """
    Generate a response based on web search results

    Results from search_web are scored against the query with
    calculate_similarity and the first highest-scoring one is chosen. When
    its score exceeds 0.1 the passage is returned together with its source
    URL, a citation built by extract_citation_info / format_citation, and
    the score; otherwise a "nothing relevant" message is returned.
    """
    candidates = search_web(user_input)
    if not candidates:
        return "No relevant information found on the web for your query."

    # Pick the most relevant result (earliest wins on ties)
    top_score, top_result = 0, None
    for candidate in candidates:
        score = calculate_similarity(user_input, candidate['text'])
        if score > top_score:
            top_score, top_result = score, candidate

    if top_score <= 0.1:
        return f"No highly relevant information found on the web. (Best match score: {top_score:.2f})"

    # Fetch citation details for the chosen page and format them
    citation = format_citation(extract_citation_info(top_result['website']))

    return "".join((
        f"Based on your input, here's information from {top_result['domain']}:\n\n",
        top_result['text'],
        f"\n\nSource: {top_result['website']}",
        f"\nCitation: {citation}",
        f"\n(Similarity score: {top_score:.2f})",
    ))

# Main program loop
def main():
    """
    Run the interactive console loop for the web-based system

    Prints a banner, then repeatedly reads a query and prints the answer from
    generate_web_response. The loop ends when the user enters 'quit' (case
    and surrounding whitespace are ignored), or when input is closed or
    interrupted. Blank queries are skipped without searching.
    """
    print("Web-Based Text Similarity Detection System")
    print("------------------------------------------")
    print("This system will search the web for information based on your query.")
    print("Type 'quit' to exit the program.\n")

    while True:
        try:
            user_input = input("Enter your query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt: leave the loop
            # cleanly instead of ending in a traceback.
            print()
            break

        if user_input.lower() == 'quit':
            break
        if not user_input:
            # Nothing to search for; avoid pointless web requests.
            continue

        print("\nSearching the web for relevant information...")
        response = generate_web_response(user_input)
        print("\nResponse:")
        print(response)
        print("\n" + "-"*50 + "\n")

    print("Thank you for using the web-based text similarity detection system!")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Web-Based Text Similarity Detection System
------------------------------------------
This system will search the web for information based on your query.
Type 'quit' to exit the program.




Searching the web for relevant information...

Response:
No relevant information found on the web for your query.

--------------------------------------------------


Searching the web for relevant information...

Response:
Based on your input, here's information from en.wikipedia.org:

A bank is a financial institution that accepts deposits from the public and creates a demand deposit while simultaneously making loans, mobilizing saver surplus to deficit spenders.[1] Lending activities can be directly performed by the bank or indirectly through capital markets.[2] Whereas banks play an important role in financial stability and the economy of a country, most jurisdictions exercise a high degree of regulation over banks. Most countries have institutionalized a system known as fractional-reserve banking, under which banks hold liquid assets equal to only a portion of their current liabilities.[3] In addition to other regulations intended to ensure liquidity, banks are generally subject