In [None]:
%%capture
# Install Unsloth, Xformers (Flash Attention) and all other packages.
# The %%capture magic above hides the (long) installer output of this cell.
!pip install unsloth
# Then upgrade Unsloth itself to the latest GitHub revision; --no-deps leaves the
# dependencies installed by the previous command untouched.
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel

# Load the Llama 3.1 8B Instruct checkpoint and its tokenizer through Unsloth.
# Both names are reused by every inference cell below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = 8192,  # sequence-length budget passed to the loader
    load_in_4bit = True,    # request 4-bit loading
)

In [None]:
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template
# Rebind `tokenizer` to a version that applies the Llama 3.1 chat template.
# The mapping lets prompts be written with ShareGPT-style keys
# ("from"/"value") and role names ("human"/"gpt"), as the cells below do.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
from datasets import load_dataset

# Load the IMDB dataset; later cells read ds["train"][i]['text'] and
# ds["train"][i]['label'] from it.
ds = load_dataset("stanfordnlp/imdb")
# Display the first training example as the cell output.
ds["train"][0]

In [None]:
# Classify a single review (training example 0) as a smoke test of the prompt.
messages = [
                               # EDIT HERE!
    {"from": "human", "value": "Classify the following movie review with sentiment analysis. Return 0 for negative and 1 for positive and nothing else.\n"+ds["train"][0]['text']},
]
# Render the chat template to token ids and move them to the GPU.
inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")

# Only needed by the commented-out streaming variant at the bottom of this cell.
text_streamer = TextStreamer(tokenizer)
output = model.generate(input_ids = inputs, max_new_tokens = 2, use_cache = True)
# Decode the second-to-last token of the returned sequence and parse it as an int.
# NOTE(review): presumably this is the first generated token when exactly two new
# tokens are produced; int() raises ValueError if it is not a digit -- confirm.
print(int(tokenizer.decode(output[0,-2])))
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)
# tokenizer.batch_decode(_)

In [None]:
import os
import re
from typing import Dict, List, Tuple, Union
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
import nltk
# Download the NLTK tagger resource; weight_graph in this cell calls pos_tag.
nltk.download('averaged_perceptron_tagger_eng')
# Graph: tail token -> {head token -> edge count (int) or edge weight (float)}.
Graph = Dict[str, Dict[str, Union[int, float]]]
# Sentence: one tokenised sentence, i.e. a list of token strings.
Sentence = List[str]

def parse(content: str) -> List[Sentence]:
    """Clean raw text and split it into tokenised sentences.

    The text is cleaned by replacing HTML tags, emoticons and every character
    that is not a word character, whitespace or one of ``. , ! ?`` with a
    space. It is then split on ``.``, ``!`` and ``?``; each piece is
    lower-cased and reduced to its word tokens.

    Args:
        content: Raw text, e.g. one movie review.

    Returns:
        One token list per sentence that contains at least one word, each of
        the form ``['<START>', word, ..., '<END>']``. Empty if no words remain.
    """
    content = re.sub(r'<.*?>', ' ', content)
    # Emoticons such as ":)", ";-P", "xD" or "8-)". Faces that start with a
    # letter or digit (8, x, X) must not be preceded by a word character, and
    # no face may run straight into one; otherwise ordinary text is corrupted
    # ("exotic" -> "e tic", "(1998)" -> "199", "8/10" -> "10").
    content = re.sub(r'(?:[:;=]|(?<!\w)[8xX])[-~]?[)(DPdOo/\\|*]+(?!\w)', ' ', content)
    content = re.sub(r'[^\w\s.,!?]', ' ', content)
    content = re.sub(r'\s+', ' ', content).strip()

    result = []
    for sentence in re.split(r'[.!?]', content):
        # Extract the words once; pieces without any word are dropped.
        words = re.findall(r'\b\w+\b', sentence.lower())
        if not words:
            continue
        result.append(['<START>', *words, '<END>'])

    return result

def encode(sentences: List[Sentence]) -> Tuple[Graph, Dict]:
    """Build a word-adjacency graph from tokenised sentences.

    Every pair of consecutive tokens in a sentence adds one to the count of
    the edge ``first -> second``.

    Args:
        sentences: Token lists, each expected to be non-empty.

    Returns:
        A tuple ``(graph, {})`` where ``graph[tail][head]`` is the number of
        times ``head`` directly followed ``tail``. The second element is
        always an empty dict.
    """
    adjacency = {}
    for tokens in sentences:
        source = tokens[0]
        # Interior tokens first, then the closing token of the sentence.
        for target in [*tokens[1:-1], tokens[-1]]:
            successors = adjacency.setdefault(source, {})
            successors[target] = successors.get(target, 0) + 1
            source = target
    return adjacency, {}

def weight_graph(graph: Graph, sentences: List[Sentence]) -> Graph:
    """Turn edge counts into edge weights using TF-IDF and part-of-speech tags.

    Each sentence is joined into one document and fed to a TfidfVectorizer;
    a word's score is its maximum TF-IDF value over all documents (1.0 when
    the vectorizer did not produce a feature for it). The score is multiplied
    by 3.0 for heads tagged 'JJ' or 'RB' and by 2.0 for heads tagged 'NN' or
    'VB'. The final edge weight is ``(1 - count / total) * score`` where
    ``total`` is the sum of counts leaving the tail.

    Args:
        graph: ``tail -> {head -> count}`` mapping.
        sentences: Token lists used to fit the TF-IDF model.

    Returns:
        ``tail -> {head -> weight}``; an empty dict (after printing a
        warning) when ``sentences`` is empty.
    """
    documents = [' '.join(tokens) for tokens in sentences]
    if not documents:
        print("Warning: No valid tokens for TF-IDF calculation.")
        return {}

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(documents)
    # Highest TF-IDF value reached by each feature across all documents.
    column_max = matrix.max(axis=0).toarray()[0]
    score_of = dict(zip(tfidf.get_feature_names_out(), column_max))

    result = {}
    for tail, heads in graph.items():
        outgoing = sum(heads.values())
        edges = {}
        result[tail] = edges
        for head, count in heads.items():
            tag = pos_tag([head])[0][1]
            base = score_of.get(head, 1.0)
            if tag in ['JJ', 'RB']:
                scaled = base * 3.0
            elif tag in ['NN', 'VB']:
                scaled = base * 2.0
            else:
                scaled = base
            edges[head] = (1 - (count / outgoing)) * scaled
    return result

def traverse(graph: Graph, max_len: int = 5) -> List[Sentence]:
    """Collect paths from '<START>' to '<END>' by breadth-first expansion.

    A path is accepted when it ends in '<END>' and has more than three
    tokens; its score is the summed edge weight divided by the path length.
    A token is never visited twice within one path.

    Args:
        graph: ``tail -> {head -> weight}`` mapping.
        max_len: Stop as soon as this many paths have been accepted.

    Returns:
        The accepted paths (token lists), ordered by ascending score.
    """
    completed = []
    queue = [(['<START>'], 0)]
    while queue and len(completed) < max_len:
        route, total = queue.pop(0)
        last = route[-1]
        if last == '<END>' and len(route) > 3:
            completed.append((route, total / len(route)))
        else:
            for candidate, weight in graph.get(last, {}).items():
                if candidate not in route:
                    queue.append((route + [candidate], total + weight))
    ranked = sorted(completed, key=lambda entry: entry[1])
    return [route for route, _score in ranked if route[-1] == '<END>']

def compress_content(content: str):
    """Compress a text into a newline-separated list of extracted word paths.

    The text is split on ``.``, ``!`` and ``?``. Pieces with fewer than three
    whitespace-separated tokens are skipped. Every remaining piece is parsed,
    encoded as a word graph, weighted and traversed; each resulting path is
    joined into a string without its '<START>'/'<END>' markers.

    Args:
        content: Raw text, e.g. one movie review.

    Returns:
        The distinct path strings joined by newlines, in the order they were
        first produced. Empty string when nothing qualifies.
    """
    compressed = []
    for sentence in re.split(r'[.!?]', content.strip()):
        if len(sentence.split()) < 3:
            continue
        tokens = parse(sentence)
        graph, _ = encode(tokens)
        weighted_graph = weight_graph(graph, tokens)
        paths = traverse(weighted_graph)
        compressed.extend(' '.join(path[1:-1]) for path in paths)

    # De-duplicate while keeping first-seen order. A set would reorder the
    # lines differently on every kernel start (string hash randomisation),
    # making the prompt built from this text non-reproducible.
    return '\n'.join(dict.fromkeys(compressed))


In [None]:
# Show one review next to its compressed form for a quick visual comparison.
i=12
print(i, ds["train"][i]['text'])
print(i, compress_content(ds["train"][i]['text']))

In [None]:
# Baseline: classify the raw (uncompressed) reviews and report the accuracy.
correct = 0
num_examples = 1000  # use len(ds["train"]) to evaluate the whole split
for i in range(num_examples):
  example = ds["train"][i]
  messages = [
      # The "\n" keeps the instruction from running straight into the review text.
      {"from": "human", "value": "Classify the following movie review with sentiment analysis. Return 0 for negative and 1 for positive and nothing else.\n"+example['text']},
  ]
  inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")

  output = model.generate(input_ids = inputs, max_new_tokens = 2, use_cache = True)
  # The returned sequence starts with the prompt, so the first generated token
  # sits at index inputs.shape[1] regardless of how many tokens were produced.
  prediction = tokenizer.decode(output[0, inputs.shape[1]]).strip()
  # Compare as text: an answer that is not "0"/"1" counts as wrong instead of
  # aborting the whole run with a ValueError from int().
  if prediction == str(example['label']):
    correct += 1
  if((i+1) % 100 == 0):
    print(correct,'correct in',i+1)
# Accuracy over the examples that were actually evaluated.
print(correct/num_examples)

In [None]:
# Decode the token ids held in `output` (set by the most recent generate call)
# back to text for inspection.
tokenizer.batch_decode(output)

In [None]:
import os
import re
from typing import Dict, List, Tuple, Union
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
# Download the NLTK tagger resource; weight_graph in this cell calls pos_tag.
nltk.download('averaged_perceptron_tagger_eng')
# Graph: tail token -> {head token -> edge count (int) or edge weight (float)}.
Graph = Dict[str, Dict[str, Union[int, float]]]
# Sentence: one tokenised sentence, i.e. a list of token strings.
Sentence = List[str]

def parse(content: str) -> List[Sentence]:
    """Clean raw text and split it into tokenised sentences.

    The text is cleaned by replacing HTML tags, emoticons and every character
    that is not a word character, whitespace or one of ``. , ! ?`` with a
    space. It is then split on ``.``, ``!`` and ``?``; each piece is
    lower-cased and reduced to its word tokens.

    Args:
        content: Raw text, e.g. one movie review.

    Returns:
        One token list per sentence that contains at least one word, each of
        the form ``['<START>', word, ..., '<END>']``. Empty if no words remain.
    """
    content = re.sub(r'<.*?>', ' ', content)
    # Emoticons such as ":)", ";-P", "xD" or "8-)". Faces that start with a
    # letter or digit (8, x, X) must not be preceded by a word character, and
    # no face may run straight into one; otherwise ordinary text is corrupted
    # ("exotic" -> "e tic", "(1998)" -> "199", "8/10" -> "10").
    content = re.sub(r'(?:[:;=]|(?<!\w)[8xX])[-~]?[)(DPdOo/\\|*]+(?!\w)', ' ', content)
    content = re.sub(r'[^\w\s.,!?]', ' ', content)
    content = re.sub(r'\s+', ' ', content).strip()

    result = []
    for sentence in re.split(r'[.!?]', content):
        # Extract the words once; pieces without any word are dropped.
        words = re.findall(r'\b\w+\b', sentence.lower())
        if not words:
            continue
        result.append(['<START>', *words, '<END>'])

    return result

def encode(sentences: List[Sentence]) -> Tuple[Graph, Dict]:
    """Build a word-adjacency graph from tokenised sentences.

    Every pair of consecutive tokens in a sentence adds one to the count of
    the edge ``first -> second``.

    Args:
        sentences: Token lists, each expected to be non-empty.

    Returns:
        A tuple ``(graph, {})`` where ``graph[tail][head]`` is the number of
        times ``head`` directly followed ``tail``. The second element is
        always an empty dict.
    """
    adjacency = {}
    for tokens in sentences:
        source = tokens[0]
        # Interior tokens first, then the closing token of the sentence.
        for target in [*tokens[1:-1], tokens[-1]]:
            successors = adjacency.setdefault(source, {})
            successors[target] = successors.get(target, 0) + 1
            source = target
    return adjacency, {}

def weight_graph(graph: Graph, sentences: List[Sentence]) -> Graph:
    """Turn edge counts into edge weights using TF-IDF and part-of-speech tags.

    Each sentence is joined into one document and fed to a TfidfVectorizer;
    a word's score is its maximum TF-IDF value over all documents (1.0 when
    the vectorizer did not produce a feature for it). The score is multiplied
    by 3.0 for heads tagged 'JJ' or 'RB' and by 2.0 for heads tagged 'NN' or
    'VB'. The final edge weight is ``(1 - count / total) * score`` where
    ``total`` is the sum of counts leaving the tail.

    Args:
        graph: ``tail -> {head -> count}`` mapping.
        sentences: Token lists used to fit the TF-IDF model.

    Returns:
        ``tail -> {head -> weight}``; an empty dict (after printing a
        warning) when ``sentences`` is empty.
    """
    documents = [' '.join(tokens) for tokens in sentences]
    if not documents:
        print("Warning: No valid tokens for TF-IDF calculation.")
        return {}

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(documents)
    # Highest TF-IDF value reached by each feature across all documents.
    column_max = matrix.max(axis=0).toarray()[0]
    score_of = dict(zip(tfidf.get_feature_names_out(), column_max))

    result = {}
    for tail, heads in graph.items():
        outgoing = sum(heads.values())
        edges = {}
        result[tail] = edges
        for head, count in heads.items():
            tag = pos_tag([head])[0][1]
            base = score_of.get(head, 1.0)
            if tag in ['JJ', 'RB']:
                scaled = base * 3.0
            elif tag in ['NN', 'VB']:
                scaled = base * 2.0
            else:
                scaled = base
            edges[head] = (1 - (count / outgoing)) * scaled
    return result

def remove_duplicates(sentences: List[str]) -> List[str]:
    """Return the sentences with repeats removed, keeping first-seen order.

    Args:
        sentences: Strings that may contain exact duplicates.

    Returns:
        A new list holding the first occurrence of every distinct string.
    """
    # Dict keys are unique and remember insertion order.
    return list(dict.fromkeys(sentences))

def filter_sentences_by_similarity(sentences: List[str], threshold: float = 0.7) -> List[str]:
    """Greedily drop sentences that are too similar to an earlier kept one.

    Sentences are vectorised with TF-IDF and compared by cosine similarity.
    Walking the list in order, each sentence not yet marked is kept, and every
    later sentence whose similarity to it exceeds ``threshold`` is marked as
    covered and therefore skipped.

    Args:
        sentences: Candidate sentences.
        threshold: Similarity above which a later sentence is dropped.

    Returns:
        The kept sentences in their original order. ``[]`` for empty input;
        the input list itself (after a warning) when no TF-IDF vocabulary
        can be built.
    """
    if not sentences:
        return []

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no sentence
        # yields a token, e.g. when all words are single characters. Without
        # this handler the fallback below could never be reached.
        tfidf_matrix = None

    if tfidf_matrix is None or tfidf_matrix.shape[1] == 0:
        print("Warning: Empty vocabulary; returning original sentences.")
        return sentences

    similarity_matrix = cosine_similarity(tfidf_matrix)
    selected_sentences = []
    covered = set()  # indices already kept or found redundant
    for i, sentence in enumerate(sentences):
        if i in covered:
            continue
        selected_sentences.append(sentence)
        covered.add(i)
        for j in range(i + 1, len(sentences)):
            if similarity_matrix[i, j] > threshold:
                covered.add(j)
    return selected_sentences

def traverse(graph: Graph, max_len: int = 5) -> List[Sentence]:
    """Collect paths from '<START>' to '<END>' by breadth-first expansion.

    A path is accepted when it ends in '<END>' and has more than three
    tokens; its score is the summed edge weight divided by the path length.
    A token is never visited twice within one path.

    Args:
        graph: ``tail -> {head -> weight}`` mapping.
        max_len: Stop as soon as this many paths have been accepted.

    Returns:
        The accepted paths (token lists), ordered by ascending score.
    """
    completed = []
    queue = [(['<START>'], 0)]
    while queue and len(completed) < max_len:
        route, total = queue.pop(0)
        last = route[-1]
        if last == '<END>' and len(route) > 3:
            completed.append((route, total / len(route)))
        else:
            for candidate, weight in graph.get(last, {}).items():
                if candidate not in route:
                    queue.append((route + [candidate], total + weight))
    ranked = sorted(completed, key=lambda entry: entry[1])
    return [route for route, _score in ranked if route[-1] == '<END>']

def process_file(input_path: str, output_path: str):
    """Compress the text of one file and write the result to another file.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    print(f"Processing file: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Produce the text to write; without this assignment `compressed_text`
    # is undefined and the write below raises NameError.
    compressed_text = compress_content(content)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(compressed_text)

def compress_content(content: str):
    """Compress a text into a newline-separated list of extracted word paths.

    The text is split on ``.``, ``!`` and ``?``; pieces with fewer than three
    whitespace-separated tokens are ignored. Each remaining piece is parsed,
    encoded, weighted and traversed, and every resulting path is joined into
    a string without its first and last token. Exact duplicates are removed
    and the remainder is filtered by similarity before joining.

    Args:
        content: Raw text, e.g. one movie review.

    Returns:
        The surviving path strings joined by newlines.
    """
    candidates = []
    for chunk in re.split(r'[.!?]', content.strip()):
        if len(chunk.split()) >= 3:
            parsed = parse(chunk)
            word_graph, _unused = encode(parsed)
            ranked_paths = traverse(weight_graph(word_graph, parsed))
            for path in ranked_paths:
                # Drop the first and last token of the path.
                candidates.append(' '.join(path[1:-1]))

    distinct = remove_duplicates(candidates)
    kept = filter_sentences_by_similarity(distinct)
    return '\n'.join(kept)


In [None]:
# Evaluate the classifier on compressed reviews, tallying correct, incorrect
# and unparseable ("unknown") answers.
incorrect = 0
correct = 0
unknown = 0
correct_ls = []  # indices of the correctly classified examples
# for i in range(len(ds["train"])):
for i in range(1000):
  example = ds["train"][i]
  messages = [
                                # EDIT HERE!
      {"from": "human", "value": "Classify the following movie review with sentiment analysis. Return 0 for negative and 1 for positive and nothing else.\n\n"+compress_content(example['text'])},
  ]
  inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")

  output = model.generate(input_ids = inputs, max_new_tokens = 2, use_cache = True)
  # The returned sequence starts with the prompt, so the first generated token
  # sits at index inputs.shape[1]; decode it once and reuse the text.
  prediction = tokenizer.decode(output[0, inputs.shape[1]]).strip()
  if prediction not in ('0', '1'):
    # print(i, compress_content(ds["train"][i]['text']))
    # print(i, ds["train"][i]['text'])
    unknown += 1
  elif int(prediction) == example['label']:
    correct += 1
    correct_ls.append(i)
  else:
    incorrect += 1
  if((i+1) % 100 == 0):
    print(f"Total: {i+1}. Correct: {correct}. Incorrect: {incorrect}. Unknown: {unknown}")
print(correct_ls)

In [None]:
# Compare the character length and content of one review before and after
# compression.
i=13
original_text = ds["train"][i]['text']
print(i, len(original_text), original_text)
# Compress once and reuse the result; running the pipeline twice doubles the
# work and repeats any warnings it prints.
compressed_text = compress_content(original_text)
print(i, len(compressed_text), compressed_text)