<a href="https://colab.research.google.com/github/Brunozml/artistotllm/blob/main/n_gram_similarity_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
🎯 N-gram Navigator: Your Text's Best Friend! 🎯

Welcome to the N-gram similarity analyzer, where we slice and dice text into delicious
bite-sized chunks (n-grams) and compare them using the mighty TF-IDF powers! It's like
a word sandwich detector that tells you how similar two texts are based on their
ingredient combinations.

Pro tip: Works best with a cup of coffee and a sense of humor! ☕️🔍
"""

from collections import Counter
from typing import List, Tuple, Dict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import numpy as np
from typing import Tuple, List, Set
import requests # Import the requests library

# Download required NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
# Download the specific resource needed by pos_tag for English
nltk.download('averaged_perceptron_tagger_eng')

def get_ngrams(text: str, n: int) -> List[str]:
    """Generate n-grams from text"""
    words = text.lower().split()
    return [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

def calculate_tfidf(texts: List[str]) -> Tuple[np.ndarray, List[str]]:
    """Calculate TF-IDF vectors for the texts"""
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix.toarray(), vectorizer.get_feature_names_out()

def ngram_similarity(text1: str, text2: str, n: int = 2) -> Tuple[float, Dict]:
    """
    Compare two texts using n-gram overlap with TF-IDF weights.
    Returns a similarity score and the n-grams with their TF-IDF weights.
    """
    # Get n-grams
    ngrams1 = get_ngrams(text1, n)
    ngrams2 = get_ngrams(text2, n)

    # Join n-grams back to text for TF-IDF calculation
    text1_ngrams = ' '.join(ngrams1)
    text2_ngrams = ' '.join(ngrams2)

    # Calculate TF-IDF
    tfidf_matrix, feature_names = calculate_tfidf([text1_ngrams, text2_ngrams])

    # Calculate cosine similarity
    similarity = np.dot(tfidf_matrix[0], tfidf_matrix[1]) / (
        np.linalg.norm(tfidf_matrix[0]) * np.linalg.norm(tfidf_matrix[1])
    )

    # Get top weighted n-grams for each text
    def get_top_ngrams(tfidf_vector, feature_names, top_n=5):
        indices = np.argsort(tfidf_vector)[-top_n:]
        return {feature_names[i]: float(tfidf_vector[i]) for i in indices if tfidf_vector[i] > 0}

    return similarity, {
        "text1_top_ngrams": get_top_ngrams(tfidf_matrix[0], feature_names),
        "text2_top_ngrams": get_top_ngrams(tfidf_matrix[1], feature_names)
    }

def read_file(filepath):
    """Read text from a file (local or URL) and return its contents"""
    # Check if the filepath is a URL
    if filepath.startswith('http://') or filepath.startswith('https://'):
        try:
            response = requests.get(filepath)
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {filepath}: {e}")
            return None
    else:
        # Assume it's a local file path
        try:
            with open(filepath, 'r') as f:
                return f.read()
        except FileNotFoundError:
            print(f"Error opening local file {filepath}: File not found")
            return None

if __name__ == "__main__":
    # Example texts
    # text1 = "The cat quickly jumped over the lazy dog."
    # text2 = "A dog slowly walked under the tired cat."
    # Read the files
    data_path = 'https://raw.githubusercontent.com/Brunozml/artistotllm/main/data/raw/' # Use raw.githubusercontent.com for direct file access
    file1 = 'gpt_what_to_do.txt'
    file2 = 'hypewrite_what_to_do.txt'

    # Use the updated read_file function
    text1 = read_file(data_path + file1)
    text2 = read_file(data_path + file2)

    # Only proceed if both files were read successfully
    if text1 is not None and text2 is not None:
        # Compare texts
        similarity_score, gram_info = ngram_similarity(text1, text2)

        # Print results
        print(f"Text 1: {file1}")
        print(f"Text 2: {file2}")

        print(f"\nN-gram Similarity score: {similarity_score:.2f}")
        print("\nTop weighted n-grams in Text 1:")
        for ngram, weight in gram_info["text1_top_ngrams"].items():
            print(f"'{ngram}': {weight:.3f}")
        print("\nTop weighted n-grams in Text 2:")
        for ngram, weight in gram_info["text2_top_ngrams"].items():
            print(f"'{ngram}': {weight:.3f}")

Text 1: gpt_what_to_do.txt
Text 2: hypewrite_what_to_do.txt

N-gram Similarity score: 0.57

Top weighted n-grams in Text 1:
'of': 0.229
'your': 0.229
'and': 0.261
'to': 0.261
'the': 0.457

Top weighted n-grams in Text 2:
'do': 0.186
'what': 0.256
'to': 0.279
'you': 0.341
'the': 0.419


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
