In [None]:
# Per the original author's note, Phi-3 support was only available in the newest
# `transformers` code, so any preinstalled release is removed and the package is
# rebuilt from the GitHub repository.
# NOTE(review): this install is unpinned, so a later re-run may pull a different
# commit — consider pinning a release or commit hash known to work.
!pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers

In [None]:
# Keep pandas on the 1.x series.
# NOTE(review): the reason for the `<2.0.0` cap is not stated in this notebook —
# presumably a runtime compatibility constraint; confirm before relaxing it.
!pip install "pandas<2.0.0"

## Import Packages

In [None]:
import pandas
import torch
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import Dataset, load_dataset

In [None]:
# Seed PyTorch's default random number generator so repeated runs start from
# the same RNG state.
torch.manual_seed(0)

## Defining the model & tokenizer

In [None]:
# Load the Phi-3 mini (128k) instruct checkpoint as a causal language model.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    trust_remote_code=True,  # permit model code shipped with the checkpoint repo
    torch_dtype="auto",  # do not force a dtype here; let the library pick it
    device_map="cuda",  # place the weights on the GPU
)

In [None]:
# Tokenizer loaded from the same checkpoint id as the model above, so both agree
# on vocabulary and chat formatting.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

## Phi-3 Demo

In [None]:
# Small multi-turn conversation used to smoke-test the model: a system
# instruction, one completed user/assistant exchange, and a new user question
# for the model to answer.
messages = [
    dict(
        role="system",
        content="You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
    ),
    dict(
        role="user",
        content="Can you provide ways to eat combinations of bananas and dragonfruits?",
    ),
    dict(
        role="assistant",
        content="Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.",
    ),
    dict(role="user", content="What about solving an 2x + 3 = 7 equation?"),
]

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Generation settings shared by every pipeline call in this notebook.
generation_args = dict(
    max_new_tokens=500,  # upper bound on the length of each reply
    return_full_text=False,  # return only the newly generated text, not the prompt
    temperature=0.0,
    do_sample=False,  # sampling disabled
)

# Run the demo conversation and show the text of the first returned generation.
output = pipe(messages, **generation_args)
print(output[0]["generated_text"])

## Using Phi-3 for Aspect-Based Sentiment Analysis (ABSA)

In [None]:
# Closed label sets for the ABSA task: the aspect categories a review may
# mention and the sentiment values each aspect may take.
aspects = [
    "food",
    "service",
    "ambience",
    "price",
    "anecdotes/miscellaneous",
]
sentiments = [
    "positive",
    "neutral",
    "negative",
]

In [None]:
# ABSA prompt: the system instruction (with the allowed aspects and sentiments
# interpolated), one worked review/answer pair, and a second review for the
# model to label.
messages = [
    dict(
        role="system",
        content=f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation.",
    ),
    dict(
        role="user",
        content="Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening",
    ),
    dict(
        role="assistant",
        content="{'service':'positive','food':'positive','price':'positive'}",
    ),
    dict(
        role="user",
        content="I can understand the prices if it served better food, like some Chinese restaurants in midtown/uptown area.",
    ),
]

In [None]:
# Text-generation pipeline over the loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Generation settings: capped reply length, prompt excluded from the returned
# text, sampling disabled.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

# Ask the model to label the last review in the ABSA prompt.
output = pipe(messages, **generation_args)



 {'food':'negative','price':'neutral'}


In [None]:
# Show the text of the first generation returned by the preceding pipeline call.
print(output[0]["generated_text"])

## Loading data

In [None]:
# Loading test data
# Without `split`, `load_dataset` returns a `DatasetDict` keyed by split name;
# integer indexing (`dataset[0]`) and the later `.to_pandas()` calls need a
# single `Dataset`. Files passed through `data_files` land in the "train"
# split, so select it explicitly.
dataset = load_dataset("arrow", data_files="test_dataset.hf", split="train")
# Peek at the first record to check the schema.
dataset[0]

## Zero shot

In [None]:
aspects = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
sentiments = ["positive", "neutral", "negative"]


def zero_shot_classify(data):
    """Label a batch of reviews using only the system instruction as context.

    Args:
        data: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            only ``data["text"]`` (an iterable of review strings) is read.

    Returns:
        A dict with the single key ``"zero_shot_result"`` holding one entry per
        review: the parsed aspect -> sentiment dict, or ``{}`` when the model's
        reply is not a valid Python dict literal.

    Replies are parsed one at a time, so a single malformed reply no longer
    discards the results of the whole batch. Parsing uses ``ast.literal_eval``
    rather than ``eval`` so that model-generated text is never executed.
    """
    # Local import keeps this cell self-contained; `ast` is standard library.
    import ast

    instruction = f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation."
    chats = [
        [
            {"role": "system", "content": instruction},
            {"role": "user", "content": x},
        ]
        for x in data["text"]
    ]
    result = pipe(chats, **generation_args)
    output = [_res[0]["generated_text"] for _res in result]

    _output = []
    for o in output:
        try:
            parsed = ast.literal_eval(o.strip())
            # A literal that is not a dict (e.g. a list or a bare string) is
            # not a usable prediction either.
            if not isinstance(parsed, dict):
                raise ValueError(f"expected a dict, got {type(parsed).__name__}")
        except Exception as e:
            # Best effort: report the offending reply and fall back to an
            # empty prediction for this review only.
            print(f"Output not a valid json. \nException: {e} \nOutput: {o}")
            print("\n")
            parsed = {}
        _output.append(parsed)

    return {"zero_shot_result": _output}

In [None]:
"""messages = [
    {"role": "system", "content": f'Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation.'},
    {"role": "user", "content": 'Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening'},
    ]"""

'messages = [\n    {"role": "system", "content": f\'Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation.\'},\n    {"role": "user", "content": \'Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening\'},\n    ]'

In [None]:
# Apply the zero-shot classifier to the dataset, 10 reviews per call.
dataset_zs = dataset.map(zero_shot_classify, batched=True, batch_size=10)

Map:   0%|          | 0/749 [00:00<?, ? examples/s]



Output not a valid json. 
Exception: unterminated string literal (detected at line 1) (<string>, line 1) 
Output: [" Certainly! Here's a well-crafted bread recipe that you can try:\n\n**Ingredients:**\n\n- 1 cup warm water (110°F/45°C)\n- 2 tablespoons white sugar\n- 2 1/4 teaspoons active dry yeast (one packet)\n- 1 teaspoon salt\n- 3 1/2 to 4 cups bread flour\n- 2 tablespoons unsalted butter, melted\n- 1 egg yolk\n- 1 tablespoon whole milk\n- Cornmeal or flour for dusting\n\n**Instructions:**\n\n1. In a large bowl, dissolve the sugar in warm water and stir in yeast. Let it stand until it becomes frothy, about 5 minutes.\n\n2. Add salt and 3 cups of flour to the yeast mixture. Beat until smooth.\n\n3. Gradually add the remaining flour, 1/2 cup at a time, until the dough is soft and slightly sticky.\n\n4. Turn the dough out onto a floured surface and knead for about 10 minutes, until it becomes smooth and elastic.\n\n5. Place the dough in a greased bowl, turning it to coat with oil. Co

KeyboardInterrupt: 

In [None]:
# Persist the zero-shot predictions as CSV.
# NOTE(review): absolute Google Drive path — assumes Drive is mounted at
# /content/drive and that the target folder already exists.
dataset_zs.to_pandas().to_csv(
    path_or_buf="/content/drive/MyDrive/Scientific_project/thesis/results/phi3_zero_shot_df.csv"
)

## One shot

In [None]:
aspects = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
sentiments = ["positive", "neutral", "negative"]


def one_shot_classify(data):
    """Label a batch of reviews, showing the model one worked example first.

    Args:
        data: Batch mapping; only ``data["text"]`` (an iterable of review
            strings) is read.

    Returns:
        A dict with the single key ``"one_shot_result"`` holding, per review,
        the raw generated text of the first returned generation (unparsed).
    """
    instruction = f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation."

    # (review, expected answer) pairs presented to the model before the query.
    demonstrations = (
        (
            "Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening",
            "{'service':'positive','food':'positive','price':'positive'}",
        ),
    )

    def build_chat(review):
        # Build fresh message dicts for every review.
        chat = [{"role": "system", "content": instruction}]
        for sample_review, sample_answer in demonstrations:
            chat.append({"role": "user", "content": sample_review})
            chat.append({"role": "assistant", "content": sample_answer})
        chat.append({"role": "user", "content": review})
        return chat

    chats = [build_chat(review) for review in data["text"]]
    generations = pipe(chats, **generation_args)

    replies = []
    for candidates in generations:
        replies.append(candidates[0]["generated_text"])
    return {"one_shot_result": replies}

In [None]:
# Apply the one-shot classifier to the dataset, 25 reviews per call.
dataset_os = dataset.map(one_shot_classify, batched=True, batch_size=25)

Map:   0%|          | 0/749 [00:00<?, ? examples/s]



In [None]:
# Persist the one-shot predictions as CSV.
# NOTE(review): absolute Google Drive path — assumes Drive is mounted at
# /content/drive and that the target folder already exists.
dataset_os.to_pandas().to_csv(
    path_or_buf="/content/drive/MyDrive/Scientific_project/thesis/results/phi3_one_shot_df.csv"
)

## Two shot

In [None]:
aspects = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
sentiments = ["positive", "neutral", "negative"]


def two_shot_classify(data):
    """Label a batch of reviews, showing the model two worked examples first.

    Args:
        data: Batch mapping; only ``data["text"]`` (an iterable of review
            strings) is read.

    Returns:
        A dict with the single key ``"two_shot_result"`` holding, per review,
        the raw generated text of the first returned generation (unparsed).
    """
    instruction = f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation."

    # (review, expected answer) pairs presented to the model before the query.
    demonstrations = (
        (
            "Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening",
            "{'service':'positive','food':'positive','price':'positive'}",
        ),
        (
            "I can understand the prices if it served better food, like some Chinese restaurants in midtown/uptown area.",
            "{'price':'negative','food':'negative'}",
        ),
    )

    def build_chat(review):
        # Build fresh message dicts for every review.
        chat = [{"role": "system", "content": instruction}]
        for sample_review, sample_answer in demonstrations:
            chat.append({"role": "user", "content": sample_review})
            chat.append({"role": "assistant", "content": sample_answer})
        chat.append({"role": "user", "content": review})
        return chat

    chats = [build_chat(review) for review in data["text"]]
    generations = pipe(chats, **generation_args)

    replies = []
    for candidates in generations:
        replies.append(candidates[0]["generated_text"])
    return {"two_shot_result": replies}

In [None]:
# Apply the two-shot classifier to the dataset, 50 reviews per call.
dataset_ts = dataset.map(two_shot_classify, batched=True, batch_size=50)

Map:   0%|          | 0/749 [00:00<?, ? examples/s]



In [None]:
# Persist the two-shot predictions as CSV.
# NOTE(review): absolute Google Drive path — assumes Drive is mounted at
# /content/drive and that the target folder already exists.
dataset_ts.to_pandas().to_csv(
    path_or_buf="/content/drive/MyDrive/Scientific_project/thesis/results/phi3_two_shot_df.csv"
)

## Three shot

In [None]:
aspects = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
sentiments = ["positive", "neutral", "negative"]


def three_shot_classify(data):
    """Label a batch of reviews, showing the model three worked examples first.

    Args:
        data: Batch mapping; only ``data["text"]`` (an iterable of review
            strings) is read.

    Returns:
        A dict with the single key ``"three_shot_result"`` holding, per review,
        the raw generated text of the first returned generation (unparsed).
    """
    instruction = f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation."

    # (review, expected answer) pairs presented to the model before the query.
    demonstrations = (
        (
            "Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening",
            "{'service':'positive','food':'positive','price':'positive'}",
        ),
        (
            "I can understand the prices if it served better food, like some Chinese restaurants in midtown/uptown area.",
            "{'price':'negative','food':'negative'}",
        ),
        (
            "I would definitely go back -- if only for some of those exotic martinis on the blackboard.",
            "{'food':'positive'}",
        ),
    )

    def build_chat(review):
        # Build fresh message dicts for every review.
        chat = [{"role": "system", "content": instruction}]
        for sample_review, sample_answer in demonstrations:
            chat.append({"role": "user", "content": sample_review})
            chat.append({"role": "assistant", "content": sample_answer})
        chat.append({"role": "user", "content": review})
        return chat

    chats = [build_chat(review) for review in data["text"]]
    generations = pipe(chats, **generation_args)

    replies = []
    for candidates in generations:
        replies.append(candidates[0]["generated_text"])
    return {"three_shot_result": replies}

In [None]:
# Apply the three-shot classifier to the dataset, 100 reviews per call.
dataset_3s = dataset.map(three_shot_classify, batched=True, batch_size=100)

Map:   0%|          | 0/749 [00:00<?, ? examples/s]



In [None]:
# Persist the three-shot predictions as CSV.
# NOTE(review): absolute Google Drive path — assumes Drive is mounted at
# /content/drive and that the target folder already exists.
dataset_3s.to_pandas().to_csv(
    path_or_buf="/content/drive/MyDrive/Scientific_project/thesis/results/phi3_three_shot_df.csv"
)

## Five shot

In [None]:
aspects = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
sentiments = ["positive", "neutral", "negative"]


def five_shot_classify(data):
    """Label a batch of reviews, showing the model five worked examples first.

    Args:
        data: Batch mapping; only ``data["text"]`` (an iterable of review
            strings) is read.

    Returns:
        A dict with the single key ``"five_shot_result"`` holding, per review,
        the raw generated text of the first returned generation (unparsed).
    """
    instruction = f"Your task is to analyse and perform aspect based sentiment analysis on the given restaurant reviews text. The aspects should only be {aspects} and sentiments should only be {sentiments}. Output should only contain mentioned aspects and their respective sentiments as a dictionary. Each aspects should only have one sentiment and not every aspect is necessarily present. Do not provide any further explanation."

    # (review, expected answer) pairs presented to the model before the query.
    demonstrations = (
        (
            "Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening",
            "{'service':'positive','food':'positive','price':'positive'}",
        ),
        (
            "I can understand the prices if it served better food, like some Chinese restaurants in midtown/uptown area.",
            "{'price':'negative','food':'negative'}",
        ),
        (
            "I would definitely go back -- if only for some of those exotic martinis on the blackboard.",
            "{'food':'positive'}",
        ),
        (
            "Definitely a great spot for a nice occasion or date.",
            "{'anecdotes/miscellaneous':'neutral'}",
        ),
        (
            "The food was terrific and the service classy, attentive, without being overbearing.",
            "{'food':'positive','service':'positive'}",
        ),
    )

    def build_chat(review):
        # Build fresh message dicts for every review.
        chat = [{"role": "system", "content": instruction}]
        for sample_review, sample_answer in demonstrations:
            chat.append({"role": "user", "content": sample_review})
            chat.append({"role": "assistant", "content": sample_answer})
        chat.append({"role": "user", "content": review})
        return chat

    chats = [build_chat(review) for review in data["text"]]
    generations = pipe(chats, **generation_args)

    replies = []
    for candidates in generations:
        replies.append(candidates[0]["generated_text"])
    return {"five_shot_result": replies}

In [None]:
# Apply the five-shot classifier to the dataset, 100 reviews per call.
# This previously called `three_shot_classify`, so the "five shot" results were
# actually three-shot predictions stored under a `three_shot_result` column.
dataset_5s = dataset.map(
    lambda _rows: five_shot_classify(_rows), batched=True, batch_size=100
)

Map:   0%|          | 0/749 [00:00<?, ? examples/s]



In [None]:
# Persist the five-shot predictions as CSV.
# NOTE(review): absolute Google Drive path — assumes Drive is mounted at
# /content/drive and that the target folder already exists.
dataset_5s.to_pandas().to_csv(
    path_or_buf="/content/drive/MyDrive/Scientific_project/thesis/results/phi3_five_shot_df.csv"
)