# Functions and libraries

In [1]:
import os
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from datasets import Dataset
from tqdm import tqdm
from sklearn.metrics import f1_score
from openai import OpenAI
from getpass import getpass
import chromadb
from chromadb.config import Settings
import time
import requests
import uuid


# OpenAI client used by detect(); the key is typed in at run time rather than stored here.
client = OpenAI(
    api_key=getpass("Enter your OpenAI API Key: "),
)

In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
[0m

In [3]:
# Topics handed to model_eval() by the evaluation cells; each one is substituted
# into the prompt "Could you generate a tweet about {topic}?".
topic_list = [
    "Amsterdam",
    "Brighton",
    "#Flames1stGoal",
    "Fabbro",
    "Remembrance Day",
    "Gretzky",
    "Liverpool",
    "Nunez",
    "Tony Todd",
    "Grammy",
]

In [12]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser
from collections import Counter
import re
import math

# Download the NLTK resources this notebook requests, in the same order as before.
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'treebank',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Chunk grammar: one rule per constituent label counted by calculate_distributions().
grammar = r"""
    NP: {<DT>?<JJ>*<NN.*>}    # Noun phrase
    VP: {<VB.*><NP|PP|CLAUSE>+$}  # Verb phrase
    PP: {<IN><NP>}              # Prepositional phrase
    ADJP: {<JJ><CC>*<JJ>*}      # Adjective phrase
    ADVP: {<RB.*>}              # Adverb phrase
"""

# Shared chunk parser built from the grammar above.
parser = RegexpParser(grammar)

def calculate_distributions(texts):
    """Return relative frequencies of POS tags and chunk labels over ``texts``.

    Each text is tokenized and POS-tagged, then chunked with the module-level
    ``parser``. POS-tag frequencies are normalised by the total token count and
    chunk-label frequencies (NP, VP, PP, ADJP, ADVP) by the total chunk count.
    Both sets of frequencies are returned in a single dictionary.

    Args:
        texts: iterable of strings.

    Returns:
        dict mapping each POS tag / chunk label to its share of its own total.
    """
    chunk_labels = {"NP", "VP", "PP", "ADJP", "ADVP"}
    tag_tally = Counter()
    chunk_tally = Counter()

    for position, text in enumerate(texts, start=1):
        tagged = pos_tag(word_tokenize(text))

        # Tally one count per token's POS tag.
        tag_tally.update(tag for _, tag in tagged)

        # Tally every subtree whose label is one of the grammar's chunk types.
        chunk_tally.update(
            node.label()
            for node in parser.parse(tagged).subtrees()
            if node.label() in chunk_labels
        )

        # Progress message after every 1000th text.
        if position % 1000 == 0:
            print(f"Finished processing {position} texts")

    n_tokens = sum(tag_tally.values())
    n_chunks = sum(chunk_tally.values())

    # POS shares first, then chunk shares, merged into one mapping.
    combined = {tag: hits / n_tokens for tag, hits in tag_tally.items()}
    combined.update({label: hits / n_chunks for label, hits in chunk_tally.items()})
    return combined


def cosine_similarity(dist1, dist2):
    """Cosine similarity between two sparse distributions given as dicts.

    Keys missing from one dictionary are treated as 0 in that dictionary.

    Args:
        dist1: mapping of label -> numeric weight.
        dist2: mapping of label -> numeric weight.

    Returns:
        The cosine of the angle between the two vectors, or 0 when either
        vector has zero magnitude.
    """
    labels = set(dist1.keys()).union(dist2.keys())
    # Pair up the two weights for every label seen in either distribution.
    pairs = [(dist1.get(label, 0), dist2.get(label, 0)) for label in labels]

    overlap = sum(left * right for left, right in pairs)
    norm_left = math.sqrt(sum(left * left for left, _ in pairs))
    norm_right = math.sqrt(sum(right * right for _, right in pairs))

    # A zero-length vector has no direction; report no similarity instead of dividing by zero.
    if norm_left == 0 or norm_right == 0:
        return 0
    return overlap / (norm_left * norm_right)


def remove_words_with_ampersand(input_text):
    """Drop every whitespace-separated word that contains '&'.

    The remaining words are re-joined with single spaces.
    """
    kept = filter(lambda token: '&' not in token, input_text.split())
    return ' '.join(kept)

def clean_tweet(tweet):
    """Tidy a generated tweet before it is scored or stored.

    Steps, in order:
      1. remove every U+FFFD replacement character, wherever it occurs;
      2. remove empty parentheses ``()``;
      3. collapse runs of four or more ``.``, ``?`` or ``!`` to three;
      4. trim surrounding whitespace, then strip double quotes from the end.

    Args:
        tweet: raw generated text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Replacement characters appear where the model emitted undecodable bytes.
    cleaned_tweet = tweet.replace("�", "")

    # Empty parentheses carry no content.
    cleaned_tweet = re.sub(r"\(\)", "", cleaned_tweet)

    # Runs of 4+ dots / question marks / exclamation marks become exactly three.
    cleaned_tweet = re.sub(r"\.{4,}", "...", cleaned_tweet)
    cleaned_tweet = re.sub(r"\?{4,}", "???", cleaned_tweet)
    cleaned_tweet = re.sub(r"!{4,}", "!!!", cleaned_tweet)

    # Trim first: removing replacement characters can leave trailing whitespace
    # after a closing quote, which would otherwise hide the quote from the
    # end-anchored pattern below.
    cleaned_tweet = cleaned_tweet.strip()

    # Remove repeated double quotes at the end.
    cleaned_tweet = re.sub(r'"+$', '', cleaned_tweet)

    return cleaned_tweet.strip()

def most_human_like_tweet(generated_tweets, benchmark):
    """Pick the candidate whose POS/chunk distribution is closest to ``benchmark``.

    Every candidate is cleaned with clean_tweet(), profiled with
    calculate_distributions(), and compared to ``benchmark`` by cosine
    similarity. The first candidate with the highest similarity wins.

    Args:
        generated_tweets: iterable of candidate tweet strings.
        benchmark: reference distribution (label -> frequency).

    Returns:
        The cleaned text of the winning candidate, or None if no candidate
        scored above -1 (for example when the iterable is empty).
    """
    winner, top_similarity = None, -1

    for raw_tweet in generated_tweets:
        candidate = clean_tweet(raw_tweet)
        profile = calculate_distributions([candidate])
        similarity = cosine_similarity(profile, benchmark)

        # Strictly greater: on ties the earlier candidate is kept.
        if similarity > top_similarity:
            winner, top_similarity = candidate, similarity

    return winner

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [13]:
def detect(generated_tweet):
    """Ask the GPT-4o judge whether a tweet was generated by a bot.

    Sends one chat-completion request through the module-level ``client`` with
    temperature 0 and a 5-token reply limit.

    Args:
        generated_tweet: the tweet text to classify.

    Returns:
        The content of the first choice in the reply (the prompt asks for
        "Yes" or "No").
    """
    prompt_suffix = "Let’s think step by step. Only return Yes or No, and don't provide any extra information."
    question = f"Is this tweet generated by Bot: ###{generated_tweet}###" + prompt_suffix

    conversation = [
        {"role": "system", "content": "You are a AI generated tweet Detector."},
        {"role": "user", "content": question},
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        max_tokens=5,
        messages=conversation,
    )

    first_choice = response.choices[0]
    return first_choice.message.content


# Three user/assistant demonstration turns inserted into the chat when few_shots=True.
_FEW_SHOT_MESSAGES = [
    {
        "role": "user",
        "content": "Could you generate a tweet about Taylor Swift?",
    },
    {
        "role": "assistant",
        "content": "Stockholm!!!! Thank you for being the most generous, excitable, magical crowds, and for breaking the all time attendance record for the stadium all 3 nights. Can’t believe this was our first time playing in Sweden - but it won’t be our last… 🇸🇪 🥰",
    },
    {
        "role": "user",
        "content": "Could you generate a tweet about Trump?",
    },
    {
        "role": "assistant",
        "content": "THANK YOU—READING, PENNSYLVANIA! We are just ONE DAY away from the best jobs, the biggest paychecks, and the brightest economic future the world has ever seen—but you must get out and VOTE! #FightForAmerica",
    },
    {
        "role": "user",
        "content": "Could you generate a tweet about Coca-cola?",
    },
    {
        "role": "assistant",
        "content": "WOW! TYSM everyone for all the birthday love 🫶  We want to celebrate with as many of you as we can, so we're officially turning this into a birthday month and will be responding back with additional wishes throughout the month! 🥳 #HaveACokeDay",
    },
]

# Reference frequencies (POS tags plus NP/ADVP/ADJP/PP/VP chunk labels) that
# candidate tweets are compared against when post_processing=True. The keys have
# the same shape as the output of calculate_distributions().
_HUMAN_DISTRIBUTION_BENCHMARK = {
    "NNP": 0.11107245214850336,
    "CC": 0.024262175839871215,
    "RB": 0.05238026804527889,
    "PRP": 0.05956264107107177,
    "VBP": 0.0363054932074732,
    "VBN": 0.011839412350151873,
    "NN": 0.1409238945795302,
    "VB": 0.04371257145342362,
    "DT": 0.05979932216551541,
    "IN": 0.07728031676817795,
    "CD": 0.017169621871224054,
    "NNS": 0.030351277605178115,
    ".": 0.06337695703119336,
    "JJ": 0.058788306252099715,
    "''": 0.006478002525018547,
    ",": 0.01874570995819159,
    "WDT": 0.0016583434340378298,
    "VBZ": 0.025435181210735677,
    "POS": 0.003736787931848451,
    "TO": 0.022256216896950124,
    "``": 0.0035788954840238686,
    "PRP$": 0.015399398558987169,
    "VBD": 0.02242545490988785,
    "JJS": 0.0018401876304346045,
    "VBG": 0.021872358610622096,
    "MD": 0.012826161360408895,
    "#": 0.011843194205189587,
    "WRB": 0.005464150220324571,
    "WP": 0.0038455162641827445,
    "PDT": 0.0007850500749122452,
    "RP": 0.005637485242886488,
    "RBR": 0.0009126876824351113,
    "JJR": 0.002109644801871766,
    "$": 0.000583035984980993,
    ":": 0.0169212800570808,
    "(": 0.002084432434953669,
    ")": 0.0024471753639877898,
    "EX": 0.0009987248845431172,
    "FW": 0.00040875549865964755,
    "NNPS": 0.0011373929025926507,
    "RBS": 0.00027985727279087664,
    "UH": 0.001336570601245617,
    "WP$": 2.4897212331620784e-05,
    "SYM": 9.265544842400646e-05,
    "LS": 1.0084946767238798e-05,
    "NP": 0.6919773154224048,
    "ADVP": 0.1307694201308702,
    "ADJP": 0.05708685343799667,
    "PP": 0.11067809101217466,
    "VP": 0.009488319996553618,
}


def _sampling_args(profile, temperature, max_tokens):
    """Build the keyword arguments forwarded to the text-generation pipeline."""
    shared = {
        "max_new_tokens": max_tokens,
        "return_full_text": False,
        "do_sample": True,
    }
    if profile == "real_world":
        # Fixed sampling settings; `temperature` is intentionally not used here.
        return {
            **shared,
            "temperature": 0.4,
            "top_k": 50,
            "top_p": 0.9,
            "repetition_penalty": 1.2,
        }
    if profile == "model_selection":
        return {**shared, "temperature": temperature}
    raise ValueError(
        f"Unknown generation_profile {profile!r}; expected 'real_world' or 'model_selection'"
    )


def _is_judged_human(label):
    """Return True when the detector's reply begins with the word "No"."""
    if not isinstance(label, str):
        return False
    words = label.strip().split()
    # Tolerate case and punctuation differences such as "No." or "no,".
    return bool(words) and words[0].strip(".,!?:;\"'*").lower() == "no"


def model_eval(
    model,
    tokenizer,
    topic_list,
    file_name,
    rounds_per_topic=10,
    few_shots=False,
    temperature=0.6,
    max_tokens=70,
    collection=None,
    post_processing=False,
    generation_profile="real_world",
):
    """Generate tweets per topic, have detect() judge them, and save the results.

    For every topic the model is prompted ``rounds_per_topic`` times. Each tweet
    is passed to detect(); prompts, tweets and raw labels are written to
    ``{file_name}.csv`` and the share of tweets judged "No" (not a bot) is printed.

    Args:
        model: causal language model handed to the text-generation pipeline.
        tokenizer: tokenizer handed to the same pipeline.
        topic_list: iterable of topic strings.
        file_name: CSV path without the ".csv" extension.
        rounds_per_topic: number of tweets generated per topic.
        few_shots: when True, three demonstration exchanges precede the request.
        temperature: sampling temperature; applied only when
            ``generation_profile == "model_selection"``.
        max_tokens: maximum number of new tokens per generation.
        collection: optional vector-store collection; when given, its top hit
            for the topic is appended to the system prompt as background.
        post_processing: when True, three candidates are generated per round and
            most_human_like_tweet() picks the one to keep.
        generation_profile: "real_world" (default) uses temperature 0.4 with
            top-k 50, top-p 0.9 and repetition penalty 1.2, which is what this
            function has effectively always used; "model_selection" samples with
            ``temperature`` only.

    Raises:
        ValueError: if ``generation_profile`` is not one of the two known names.
    """
    # Validate the profile before the pipeline is built.
    generation_args = _sampling_args(generation_profile, temperature, max_tokens)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    def generate(messages):
        # One sampled completion, whitespace-trimmed.
        return pipe(messages, **generation_args)[0]["generated_text"].strip()

    prompts, outputs, labels = [], [], []

    for topic in tqdm(topic_list):
        prompt = f"Could you generate a tweet about {topic}?"

        # Optional retrieval step: use the best-matching document, if there is one.
        prompt_rag = ""
        if collection is not None:
            retrieved = collection.query(query_texts=[topic], n_results=1)["documents"]
            if retrieved and retrieved[0]:
                prompt_rag = (
                    "You can use these background information as reference (Optional): "
                    + retrieved[0][0]
                )

        messages = [{"role": "system", "content": "You are a helpful Tweet generator." + prompt_rag}]
        if few_shots:
            messages += _FEW_SHOT_MESSAGES
        messages.append({"role": "user", "content": prompt + " 50 words maximum."})

        for _ in range(rounds_per_topic):
            if post_processing:
                candidates = [generate(messages) for _ in range(3)]
                output = most_human_like_tweet(candidates, _HUMAN_DISTRIBUTION_BENCHMARK)
            else:
                output = generate(messages)

            prompts.append(prompt_rag + prompt)
            outputs.append(output)
            labels.append(detect(output))

    df = pd.DataFrame(
        {
            "prompt": prompts,
            pipe.model.name_or_path: outputs,
            "label": labels,
        }
    )
    df.to_csv(f"{file_name}.csv", index=False)

    # Nothing to average when no tweet was generated (empty topic list or zero rounds).
    if not labels:
        print("Successful rate: n/a (no tweets were generated)")
        return

    fooled = sum(_is_judged_human(label) for label in labels)
    print(f"Successful rate: {fooled / len(labels) * 100:.2f}%")

# Zero-shot Evaluation

Input: 30 tokens

Output: 70 tokens (50 words)

Evaluate one model 100 times: 100*100=10K tokens

10 models: 100K tokens


## Phi-3.5-mini-instruct (3.8B)

In [5]:
# Phi-3.5-mini-instruct: tokenizer plus weights placed on the GPU with FlashAttention-2.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Zero-shot run; results are written to 3.8B.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="3.8B",
    rounds_per_topic=10,
)

  0%|          | 0/10 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
 10%|█         | 1/10 [00:23<03:30, 23.43s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [04:09<00:00, 24.99s/it]

['"Explore Amsterdam\'s charming canals, historic architecture, and vibrant culture. #Amsterdam #CityLove #TravelGoals 🎨🍦💎🚲"', '"Exploring Amsterdam\'s charming canals, historic houses, and vibrant art scene. A city where heritage meets modernity, perfect for culture lovers. #AmsterdamAdventure #CityOfCanals"', '"Experience the charm of Amsterdam: canals, vibrant culture, and historic marvels. Visit the Anne Frank House, indulge in delicious Dutch treats, and take a magical boat ride. #AmsterdamAdventure #CityOfArt #DutchDelights �������', '"Exploring Amsterdam: A city where history, culture & canals intertwine. Iconic Anne Frank House, vibrant tulip markets, and cycling streets. #AmsterdamAdventures #CityOfCanals #UrbanCharm"', '"Exploring Amsterdam: a city where history whispers through cobblestone streets, canals reflect the vibrant tulip-filled gardens, and modern art meets old-world charm. #Amsterdam #CityOfWonders"', '"Exploring Amsterdam: a city where history whispers through co




## Phi-3-small-8k-instruct (7.4B)

In [None]:
# Phi-3-small-8k-instruct: tokenizer plus weights placed on the GPU with FlashAttention-2.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-small-8k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-small-8k-instruct",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [4]:
# Zero-shot run; results are written to 7.4B.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="7.4B",
    rounds_per_topic=10,
)

  x = [xi.to_sparse_csr() for xi in x]
 10%|█         | 1/10 [00:36<05:25, 36.20s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [05:42<00:00, 34.29s/it]

['"Exploring Amsterdam\'s enchanting canals at dawn, where the city whispers secrets of history & culture. A perfect blend of art, bikes & tulips! #Amsterdam #DutchDelights 🚲🌷🇳🇱"', "Amsterdam: A city of enchanting canals, vibrant tulip fields, and rich history. Perfect for a romantic getaway or an adventurous exploration. Don't forget to try the famous stroopwafels! 🇳🇱🌷🚣\u200d♂️ #Amsterdam #TravelGoals #Stroop", '"Exploring Amsterdam\'s charming canals, cycling past historic architecture, and indulging in delicious stroopwafels. A city where the past and present beautifully intertwine. #Amsterdam #TravelGoals 🚲🍬🏞️"', '"Exploring the enchanting canals of Amsterdam, savoring delicious stroopwafels & bike rides through Vondelpark! This vibrant city blends rich history with modern charm. #Amsterdam #TravelGoals 🚲🍪🌉"', '"Exploring Amsterdam\'s picturesque canals, historic architecture, and vibrant tulip markets. A city where the past and present beautifully intertwine. #Amsterdam #TravelGoa




## microsoft/Phi-3-medium-4k-instruct (14B)

In [None]:
# Phi-3-medium-4k-instruct: tokenizer plus weights placed on the GPU with FlashAttention-2.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-medium-4k-instruct",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [8]:
# Zero-shot run; results are written to 14B.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="14B",
    rounds_per_topic=10,
)

 10%|█         | 1/10 [00:20<03:02, 20.27s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [03:10<00:00, 19.03s/it]

['"Discover the magical canals of #Amsterdam, where history meets modernity. Enjoy world-class art, vibrant culture, and a thriving food scene. A must-visit for every traveler! #Travel #Netherlands #Travelgram"', '"Experience Amsterdam\'s enchanting canals, historic architecture, and vibrant cultural scene. From Van Gogh\'s masterpieces to aromatic coffee shops, this city captivates visitors with its unique charm. #Amsterdam #TravelGoals ���������', '"🇳🇱 Amsterdam: A city where history meets innovation, picturesque canals, vibrant art scene, and endless biking paths. A true gem in Europe\'s crown. #Amsterdam #TravelGoals #CityOfCanals"', '"Amsterdam, a city of enchanting canals, vibrant art, and rich history. Where bikes outnumber people, and tulips bloom on every corner. A must-visit for every traveler. #Amsterdam #Travel #Europe"', '"Exploring Amsterdam - a city where bikes outnumber cars, canals replace streets, and historical charm meets modern innovation. Unforgettable tulip field




## microsoft/Phi-3.5-MoE-instruct (42B)

In [None]:
# Phi-3.5-MoE-instruct: loaded with 8-bit bitsandbytes quantization.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-MoE-instruct",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [4]:
# Zero-shot run; results are written to 42B.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="42B",
    rounds_per_topic=10,
)

  0%|          | 0/10 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
 10%|█         | 1/10 [02:27<22:11, 147.91s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [25:34<00:00, 153.46s/it]

Successful rate: 19.00%





## AlanYky/phi-3.5_tweets_instruct_50k

In [5]:
# Fine-tuned 50k checkpoint; the tokenizer is taken from the base Phi-3.5-mini-instruct repo.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct_50k",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Zero-shot run; results are written to 50k.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="50k",
)

 10%|█         | 1/10 [00:26<04:00, 26.74s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [04:28<00:00, 26.84s/it]

["I'm in Amsterdam for college. It's my 1st time in Europe. I can't wait to meet new people and try new foods. Stay tuned for my updates! 😊💚 #Amsterdam #NewCollege #OnTheMove �������", 'The city I grew up in. The city I live in. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam. Amsterdam', 'Amsterdam is so beautiful!!!!!!! I’m so happy to be here. 😍💙❤️�����������������������������������������', 'Amsterdam is a beautiful place to live. �������������������������������������������������������������', "I would love to go to Amsterdam as a tourist, but I don't have the money. #someday #traveling #poverty #sadness #dreams ✨❤️✨✨✨✨✨✨✨", "I think I'm going to Amsterdam for the weekend. I don't think I'm allowed to go home until Sunday though




## AlanYky/phi-3.5_tweets_instruct (100k)

In [8]:
# Fine-tuned 100k checkpoint; the tokenizer is taken from the base Phi-3.5-mini-instruct repo.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

config.json:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
# Zero-shot run; results are written to 100k.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="100k",
)

100%|██████████| 10/10 [04:42<00:00, 28.30s/it]

["I just got back from Amsterdam, and I feel like a complete failure at life. I'm so tired. I've only been in Amsterdam for 2 hours and I am ready to go home and sleep. #tired #sad #sad #sad #sad #sad #sad #sad #s", "I've been in Amsterdam for almost 8 hours. I still don't have a hotel. This is not good. I'm not even sure where I am. I feel like I'm in a dystopian novel. #lifeless #notfun #thisisnotgood #traveling #travel #", "On my way home from Amsterdam. There are no words to describe how much I miss my home. Hope you're doing well ❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤", "I missed all the Dutch concerts this year. I'd like to visit Amsterdam and see more shows. I hope I can. ������������������������������������������", "I'd rather be in Amsterdam right now. I miss my friends. They are the best. I miss you. I'm so sorry. I love you so much. I've missed you. I've missed you. I've missed you. I've missed you. I've missed you. I've missed", "Just woke up at 12am and now I'm about to leave to Amsterdam 




# Few-shot Evaluation (three shots)

## AlanYky/phi-3.5_tweets_instruct_50k

In [None]:
# Fine-tuned 50k checkpoint; the tokenizer is taken from the base Phi-3.5-mini-instruct repo.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct_50k",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [4]:
# Three-shot run; results are written to 50k_3shots.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="50k_3shots",
    few_shots=True,
)

  0%|          | 0/10 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
 10%|█         | 1/10 [00:26<03:54, 26.07s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [03:55<00:00, 23.53s/it]

Successful rate: 30.00%





## AlanYky/phi-3.5_tweets_instruct (100k)

In [None]:
# Fine-tuned 100k checkpoint; the tokenizer is taken from the base Phi-3.5-mini-instruct repo.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [9]:
# Three-shot run for the 100k model. It gets its own output file: the previous
# name "50k_3shots" overwrote the CSV produced by the 50k few-shot run.
model_eval(model, tokenizer, topic_list, file_name="100k_3shots", few_shots=True)

  0%|          | 0/10 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
 10%|█         | 1/10 [00:52<07:48, 52.05s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 10/10 [07:48<00:00, 46.90s/it]

Successful rate: 41.00%





# RAG models

## Build Vector DB

In [2]:
# Wikipedia passages with precomputed all-MiniLM-L6-v2 embeddings.
dataset = load_dataset(
    "Supabase/wikipedia-en-embeddings",
    data_files="wiki_minilm.ndjson.gz",
    split="train",
)
dataset

Dataset({
    features: ['id', 'body', 'all-MiniLM-L6-v2'],
    num_rows: 224482
})

In [4]:
# Load vectorDB from disk.
# The Chroma client has its own name: binding it to `client` replaced the OpenAI
# client that detect() reads from the same global, so every later model_eval()
# call failed until the first cell was re-run.
chroma_client = chromadb.PersistentClient(path="/workspace", settings=Settings(allow_reset=True))
collection = chroma_client.get_or_create_collection(name="emb")

In [None]:
def load_to_db(example):
    """Add one dataset row to ``collection`` and return the row unchanged."""
    row_id = example["id"]
    body = example["body"]
    vector = example["all-MiniLM-L6-v2"]
    collection.add(ids=row_id, documents=body, embeddings=vector)
    return example

dataset.map(load_to_db, num_proc=8)

In [5]:
# Example query: print the text of the single best match.
results = collection.query(
    query_texts=["SpaceX has a new launch"],
    n_results=1,
)["documents"][0][0]
print(results)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:04<00:00, 19.4MiB/s]


Title: Falcon Heavy Content: Falcon Heavy is a reusable heavy-lift launch vehicle designed and made by SpaceX. It is inspired from the Falcon 9 vehicle. This increases the low Earth orbit (LEO) maximum payload to , compared to for a Falcon 9 Full Thrust, for Delta IV Heavy, for the Space Shuttle and for Saturn V. Falcon Heavy is the world's fourth-highest capacity rocket ever built, after Saturn V, Energia and N1, and the most powerful rocket in operation as of 2020. SpaceX conducted Falcon Heavy's first launch on February 6, 2018, at 3:45 p.m. EST (20:45 UTC)."SpaceX Falcon Heavy launch successful". CBS News. February 6, 2018. The rocket carried a Tesla Roadster belonging to SpaceX founder Elon Musk as a dummy payload into a path around the sun. The first commercial launch was on 11 April 2019, for Arabsat. It was a success. Falcon Heavy was designed to carry humans into space, for example to the Moon and Mars, although as of February 2018, it is not certified and there are no plans t

## Phi_50k with RAG

In [None]:
# Fine-tuned 50k checkpoint; the tokenizer is taken from the base Phi-3.5-mini-instruct repo.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct_50k",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

In [37]:
# Retrieval-augmented run; results are written to RAG_50k.csv.
model_eval(
    model=model,
    tokenizer=tokenizer,
    topic_list=topic_list,
    file_name="RAG_50k",
    rounds_per_topic=10,
    collection=collection,
)

100%|██████████| 10/10 [05:33<00:00, 33.30s/it]

Successful rate: 24.00%





# Breaking news update

Open question: which gives better background for a topic keyword, a Google search or RAG retrieval?

In [6]:
# Number of stored records, without loading every document into memory.
collection.count()

224482

In [10]:
def breaking_news_updating(url, collection, interval_seconds=3600, max_iterations=None, timeout=30):
    """Poll a news API on a fixed interval and add article summaries to `collection`.

    Each cycle issues a GET request to `url`. On HTTP 200 the JSON body's
    "results" list is scanned, and every non-empty "summary" is added to
    `collection` in a single batch, each under a fresh random UUID. Any
    exception raised during a cycle is printed and polling continues
    (best-effort behaviour for a long-running loop).

    Args:
        url: Full request URL, including query parameters and the API key.
        collection: Object exposing ``add(ids=..., documents=...)``
            (a Chroma collection in this notebook).
        interval_seconds: Pause between two cycles. Defaults to 3600 (one hour).
        max_iterations: Number of cycles to run. ``None`` (default) polls
            forever, which is the original behaviour.
        timeout: Seconds to wait for the HTTP response before the request is
            abandoned, so a stalled connection cannot block the loop forever.

    Returns:
        None. With ``max_iterations=None`` the function never returns.
    """
    completed = 0
    while max_iterations is None or completed < max_iterations:
        try:
            print("Making API request...")
            resp = requests.get(url, timeout=timeout)

            if resp.status_code == 200:
                articles = resp.json()["results"]
                # Skip missing/None/empty summaries: one bad entry would
                # otherwise make the whole batched add() fail.
                summaries = [article["summary"] for article in articles if article.get("summary")]
                print(f"Received {len(articles)} articles, {len(summaries)} with a summary")
                if summaries:
                    # NOTE(review): ids are random, so an article returned by
                    # two consecutive polls is stored twice.
                    collection.add(
                        ids=[str(uuid.uuid4()) for _ in summaries],
                        documents=summaries,
                    )
            else:
                print(f"Error: Received status code {resp.status_code}")

        except Exception as e:
            # Deliberately broad: a single failed cycle must not stop the poller.
            print(f"An error occurred: {e}")

        completed += 1
        if max_iterations is not None and completed >= max_iterations:
            # Do not sleep after the final requested cycle.
            break

        print(f"Waiting {interval_seconds} seconds before the next request...")
        time.sleep(interval_seconds)

In [8]:
# Read the news API key from the environment, or prompt for it, instead of
# hardcoding it: a key stored in the notebook is exposed to everyone who can
# read the file (same approach as the OpenAI key at the top of the notebook).
# NOTE(review): the key that was previously committed here should be treated
# as leaked and rotated.
API_KEY = os.environ.get("PERIGON_API_KEY") or getpass("Enter your Perigon API Key: ")
url = f"https://api.goperigon.com/v1/stories/all?country=ca&size=100&apiKey={API_KEY}"
# breaking_news_updating(url = url, collection=collection)

In [9]:
# Backfill news for ten start dates, 2024-11-14 through 2024-11-23, one request each.
# The day comes from the loop variable itself: re-initialising a counter inside the
# loop body would request 2024-11-14 on every iteration and store the same
# articles ten times.
# NOTE(review): each request only sets `from`, so consecutive windows may still
# overlap — consider bounding each request with an end date; confirm against the API docs.
for start_day in range(14, 24):
    url = f"https://api.goperigon.com/v1/stories/all?from=2024-11-{start_day}&size=100&apiKey={API_KEY}"

    try:
        # The request lives inside the try block so one failed day does not abort the backfill.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        results = resp.json()["results"]
        # Skip missing/None/empty summaries so one bad entry cannot fail the batch.
        summaries = [article["summary"] for article in results if article.get("summary")]
        if summaries:
            collection.add(ids=[str(uuid.uuid4()) for _ in summaries], documents=summaries)
    except Exception as e:
        print(f"An error occurred: {e}")

In [12]:
# Fetch one page of Canadian stories published since 2024-11-14 and show the raw entries.
url = f"https://api.goperigon.com/v1/stories/all?country=ca&from=2024-11-14&size=100&apiKey={API_KEY}"
response = requests.get(url)
results = response.json()["results"]
print(results)



In [10]:
# Number of records in the collection after the news backfill. count() avoids
# loading every stored document into memory just to measure the size.
collection.count()

225482

# Real world evaluation

In [8]:
# Topics used for the "real world" evaluation: this list is passed as
# `topic_list` to the model_eval calls in the sections below.
real_world_topic = [
    "Palestinian",
    "Donald Trump 2025 President",
    "Black Friday 2024",
    "Tax Break Canada",
    "Ukraine Russia war",
    "NBA Final",
    "Taylor Swift 2024",
    "Mbappe",
    "Stephen Curry is GOAT",
    "Las Vegas GP",
    "Montreal",
    "Squid Game 2",
    "Halloween Costume",
    "Work-Life Balance",
    "zero-carbon lifestyle",
    "Trudeau National Disgrace",
    "Trump is Hitler",
    "ChatGPT controls human",
    "Joe Biden is dead",
    "LeBron James should retire",
    "Tech layoff",
    "SpaceX successful launch",
    "The Age of AI",
    "iPhone 16",
    "Apple Intelligence"
]

## 14B Baseline

In [None]:
# 14B baseline: tokenizer and weights both come from the stock
# Phi-3-medium-4k-instruct repository.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-medium-4k-instruct",
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

In [6]:
# Baseline run on the real-world topics: few_shots enabled, 10 rounds per topic.
model_eval(
    model,
    tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    rounds_per_topic=10,
    file_name="14B_3shots_real",
)

  0%|          | 0/25 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
  4%|▍         | 1/25 [00:48<19:16, 48.20s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 25/25 [21:38<00:00, 51.94s/it]

Successful rate: 6.80%





## 50k Version

In [15]:
# Tokenizer from the base Phi-3.5-mini-instruct repo, weights from the
# "phi-3.5_tweets_instruct_50k" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct_50k",
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

config.json:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

### Base Model

In [29]:
# 50k model on the real-world topics, no retrieval and no post-processing.
model_eval(
    model,
    tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    rounds_per_topic=10,
    file_name="50k_3shots_real",
)

100%|██████████| 25/25 [22:42<00:00, 54.51s/it]

Successful rate: 25.20%





### Base Model + RAG

In [53]:
# 50k model on the real-world topics with retrieval (a collection is supplied).
# This run gets its own file_name: it previously reused "50k_3shots_real", the
# name of the base-model run, while every other configuration in this notebook
# has a distinct name.
# NOTE(review): presumably model_eval stores its results under file_name, in
# which case the two runs would have shared one output — confirm.
model_eval(
    model,
    tokenizer,
    topic_list=real_world_topic,
    file_name="50k_3shots_rag_real",
    few_shots=True,
    rounds_per_topic=10,
    collection=collection,
)

100%|██████████| 25/25 [22:26<00:00, 53.87s/it]

Successful rate: 16.00%





In [52]:
# Sanity check: print every topic for which retrieval yields no usable document.
for topic in real_world_topic:
    hits = collection.query(query_texts=[topic], n_results=1)["documents"][0]
    # An empty hit list is treated like a missing document instead of raising
    # IndexError; None is compared by identity, not with ==.
    if not hits or hits[0] is None:
        print(topic)

### Base Model + RAG + With post processing

In [None]:
# Same checkpoint/tokenizer pair as the "50k Version" cell above, loaded again
# for the RAG + post-processing run.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct_50k",
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

In [31]:
# 50k model with both retrieval (collection) and post-processing enabled.
model_eval(
    model,
    tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    rounds_per_topic=10,
    collection=collection,
    post_processing=True,
    file_name="50k_3shots_rag_post_real",
)

100%|██████████| 25/25 [40:13<00:00, 96.53s/it]

Successful rate: 16.80%





### Base Model + Post-Processing

In [16]:
# 50k model with post-processing only (no collection passed, so no retrieval).
model_eval(
    model, tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    post_processing=True,
    rounds_per_topic=10,
    file_name="50k_3_shots_post_real",
)

100%|██████████| 25/25 [36:48<00:00, 88.32s/it]

Successful rate: 24.40%





## 100k Version

In [6]:
# Tokenizer from the base Phi-3.5-mini-instruct repo, weights from the
# "phi-3.5_tweets_instruct" checkpoint (the one evaluated under "100k Version").
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "AlanYky/phi-3.5_tweets_instruct",
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

config.json:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Base Model

In [9]:
# 100k model on the real-world topics, no retrieval and no post-processing.
model_eval(
    model, tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    rounds_per_topic=10,
    file_name="100k_3shots_real",
)

  0%|          | 0/25 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
  4%|▍         | 1/25 [00:34<13:38, 34.12s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 25/25 [13:18<00:00, 31.93s/it]

Successful rate: 28.00%





In [10]:
# Second run of exactly the same configuration as the previous cell
# (identical arguments, identical file_name).
# NOTE(review): if model_eval stores its results under file_name, this run
# replaces the output of the earlier one — confirm that is intended.
model_eval(
    model,
    tokenizer,
    topic_list = real_world_topic,
    file_name="100k_3shots_real",
    few_shots=True,
    rounds_per_topic=10
)

100%|██████████| 25/25 [13:25<00:00, 32.22s/it]

Successful rate: 32.40%





### Base Model + RAG

### Base Model + RAG + Post-Processing

### Base Model + Post-Processing

In [14]:
# 100k model with post-processing only (no collection passed, so no retrieval).
model_eval(
    model, tokenizer,
    topic_list=real_world_topic,
    few_shots=True,
    post_processing=True,
    rounds_per_topic=10,
    file_name="100k_3_shots_post_real",
)

100%|██████████| 25/25 [36:51<00:00, 88.46s/it]

Successful rate: 34.00%



