<a href="https://colab.research.google.com/github/Brunozml/artistotllm/blob/main/pos_similarity_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install requests



In [7]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import numpy as np
from typing import Tuple, List, Set
import requests # Import the requests library

# Fetch the NLTK data packages this module relies on. nltk.download is a
# no-op for packages that are already installed and up to date.
#
# Newer NLTK releases load the tokenizer from 'punkt_tab' and the English
# tagger from 'averaged_perceptron_tagger_eng'; the legacy 'punkt' and
# 'averaged_perceptron_tagger' packages are still requested so the module
# keeps working on older NLTK installations.
for _resource in (
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

def get_pos_distribution(text: str) -> dict:
    """
    Compute the relative frequency of each POS tag found in ``text``.

    The text is lower-cased, tokenized and tagged; the result maps every tag
    that occurs to its share of all tagged tokens. An empty dictionary is
    returned when the text produces no tokens.
    """
    tagged_tokens = pos_tag(word_tokenize(text.lower()))

    # Nothing to count (and nothing to divide by) for empty input.
    if not tagged_tokens:
        return {}

    # Tally how often each tag occurs, in order of first appearance.
    tag_counts = {}
    for _token, label in tagged_tokens:
        tag_counts[label] = tag_counts.get(label, 0) + 1

    # Every token contributes exactly one tag, so the token count is the total.
    n_tagged = len(tagged_tokens)
    return {label: count / n_tagged for label, count in tag_counts.items()}

def pos_similarity(text1: str, text2: str) -> Tuple[float, dict]:
    """
    Compare two texts by the cosine similarity of their POS tag distributions.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A ``(similarity, distributions)`` tuple. ``similarity`` is a plain
        ``float`` in ``[0.0, 1.0]`` and is ``0.0`` when either text yields no
        tags. ``distributions`` is ``{"text1_pos": ..., "text2_pos": ...}``
        with the tag frequencies returned by ``get_pos_distribution`` for
        each text.
    """
    dist1 = get_pos_distribution(text1)
    dist2 = get_pos_distribution(text2)

    # Sort the union of tags: iterating a set of strings follows hash order,
    # which changes between interpreter runs, and floating-point sums depend
    # on the order of their terms. A fixed order makes the score reproducible.
    all_tags = sorted(set(dist1) | set(dist2))

    # Tags missing from one text contribute a zero component.
    vec1 = np.array([dist1.get(tag, 0.0) for tag in all_tags])
    vec2 = np.array([dist2.get(tag, 0.0) for tag in all_tags])

    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    if norm_vec1 == 0 or norm_vec2 == 0:
        # A zero vector (e.g. empty text) has no direction; report no similarity.
        similarity = 0.0
    else:
        cosine = float(np.dot(vec1, vec2) / (norm_vec1 * norm_vec2))
        # Frequencies are non-negative, so the true value never exceeds 1;
        # clamp away rounding overshoot such as 1.0000000000000002.
        similarity = min(1.0, cosine)

    return similarity, {"text1_pos": dist1, "text2_pos": dist2}

def read_file(filepath: str, timeout: float = 10.0):
    """Read text from a local file or an HTTP(S) URL and return its contents.

    Args:
        filepath: A local path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the server when ``filepath`` is a URL.
            Ignored for local paths.

    Returns:
        The text as a ``str``, or ``None`` when it could not be read. In the
        failure case an error message is printed instead of raising.
    """
    if filepath.startswith(('http://', 'https://')):
        try:
            # requests applies no timeout by default, so an unresponsive
            # server would block this call indefinitely without one.
            response = requests.get(filepath, timeout=timeout)
            response.raise_for_status()  # Turn 4xx/5xx responses into errors
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {filepath}: {e}")
            return None
        else:
            return response.text

    # Anything that is not an HTTP(S) URL is treated as a local path.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Local file not found at {filepath}")
        return None
    except (OSError, ValueError) as e:
        # OSError covers permission and directory errors; ValueError covers
        # undecodable bytes (UnicodeDecodeError) and invalid path strings.
        print(f"Error reading local file {filepath}: {e}")
        return None


if __name__ == "__main__":
    # Demo: score the POS similarity of two sample texts from the repository.
    # raw.githubusercontent.com is used so the files are fetched directly.
    data_path = 'https://raw.githubusercontent.com/Brunozml/artistotllm/main/data/raw/'
    file1 = 'gpt_what_to_do.txt'
    file2 = 'hypewrite_what_to_do.txt'

    # read_file returns None for any input it could not load.
    text1, text2 = (read_file(data_path + name) for name in (file1, file2))

    if text1 is None or text2 is None:
        print("Could not read one or both input files. Exiting.")
    else:
        similarity_score, pos_distributions = pos_similarity(text1, text2)

        print(f"Text 1: {file1}")
        print(f"Text 2: {file2}")
        print(f"\nPOS Similarity score: {similarity_score:.2f}")
        # pos_distributions["text1_pos"] and pos_distributions["text2_pos"]
        # hold the per-tag frequencies if a detailed breakdown is wanted.


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Text 1: gpt_what_to_do.txt
Text 2: hypewrite_what_to_do.txt

POS Similarity score: 0.88
