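# Preparing Data for Fine-tuning Llama 2

This notebook turns the Laptops train and test CSVs into Llama 2 chat-formatted prompts for aspect-term sentiment extraction and saves the rendered prompts back to Google Drive.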


### Installing and Importing Libraries

In [None]:
%%capture
!pip install transformers==4.41.2 datasets==2.19.2

In [None]:
import transformers
# Display the imported version to confirm it matches the pinned install.
transformers.__version__

'4.41.2'

In [None]:
import datasets
# Display the imported version to confirm it matches the pinned install.
datasets.__version__

'2.19.2'

In [None]:
from google.colab import drive
# Mount Google Drive; the CSVs read and written below live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd

### Loading the Data

In [None]:
# Read the train and test splits from Drive in one pass; the order of the paths
# matches the order of the names being assigned.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Fine tuning Llama/Laptops_Train.csv',
        '/content/drive/MyDrive/Fine tuning Llama/Laptops_Test.csv',
    )
)

### Preparing the Dataset

In [None]:
import ast

In [None]:
# Processing Test set
#
# Build one two-turn conversation (user prompt -> assistant answer) per test row,
# as a list of role/content dicts that the chat template is applied to further down.

# The instruction text and few-shot examples are the same for every row, so they
# are assembled once instead of on every loop iteration.
definition = 'Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.'
# NOTE(review): the trailing ', ' in this example output is kept byte-for-byte so the
# generated prompts do not change; the targets built below never end with a separator.
example1 = 'Positive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, '
example2 = 'Neutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral'
instruct = 'Now complete the following example-\ninput:'
prompt_prefix = f'{definition}\n{example1}\n{example2}\n{instruct}'

test_final_prompts = []
for raw_text, aspect in zip(test_df['raw_text'], test_df['aspectTerms']):
    # `aspectTerms` stores the string form of a list of dicts with 'term' and
    # 'polarity' keys; literal_eval parses it without executing arbitrary code.
    aspect_terms = ast.literal_eval(aspect)

    if aspect_terms:
        # 'term:polarity' pairs separated by ', ' with no trailing separator.
        answer = ', '.join(f"{item['term']}:{item['polarity']}" for item in aspect_terms)
    else:
        # The instruction promises 'noaspectterm:none' when there are no aspects;
        # emit that instead of an empty target if a row has an empty aspect list.
        answer = 'noaspectterm:none'

    test_final_prompts.append([
        {"role": "user", "content": f'{prompt_prefix}{raw_text}'},
        {"role": "assistant", "content": f'output: {answer}'},
    ])

In [None]:
# Spot-check the first test conversation: a [user, assistant] pair of role/content dicts.
print(test_final_prompts[0])

[{'role': 'user', 'content': 'Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, \nNeutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral\nNow complete the following example-\ninput:Boot time is super fast, around anywhere from 35 seconds to 1 minute.'}, {'role': 'assistant', 'content': 'output: Boot time:positive'}]


In [None]:
# Processing Train set
#
# Build one two-turn conversation (user prompt -> assistant answer) per training row,
# as a list of role/content dicts that the chat template is applied to further down.

# The instruction text and few-shot examples are the same for every row, so they
# are assembled once instead of on every loop iteration.
definition = 'Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.'
# NOTE(review): the trailing ', ' in this example output is kept byte-for-byte so the
# generated prompts do not change; the targets built below never end with a separator.
example1 = 'Positive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, '
example2 = 'Neutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral'
instruct = 'Now complete the following example-\ninput:'
prompt_prefix = f'{definition}\n{example1}\n{example2}\n{instruct}'

train_final_prompts = []
for raw_text, aspect in zip(train_df['raw_text'], train_df['aspectTerms']):
    # `aspectTerms` stores the string form of a list of dicts with 'term' and
    # 'polarity' keys; literal_eval parses it without executing arbitrary code.
    aspect_terms = ast.literal_eval(aspect)

    if aspect_terms:
        # 'term:polarity' pairs separated by ', ' with no trailing separator.
        answer = ', '.join(f"{item['term']}:{item['polarity']}" for item in aspect_terms)
    else:
        # The instruction promises 'noaspectterm:none' when there are no aspects;
        # emit that instead of an empty target if a row has an empty aspect list.
        answer = 'noaspectterm:none'

    train_final_prompts.append([
        {"role": "user", "content": f'{prompt_prefix}{raw_text}'},
        {"role": "assistant", "content": f'output: {answer}'},
    ])

In [None]:
# Spot-check the first training conversation: a [user, assistant] pair of role/content dicts.
print(train_final_prompts[0])

[{'role': 'user', 'content': 'Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, \nNeutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral\nNow complete the following example-\ninput:I charge it at night and skip taking the cord with me because of the good battery life.'}, {'role': 'assistant', 'content': 'output: cord:neutral, battery life:positive'}]


In [None]:
# Load the tokenizer of the Llama 2 chat checkpoint; below it is used to render
# each conversation to text through its chat template.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Render every training conversation to a single prompt string: text only
# (tokenize=False) and without a trailing generation prompt.
train_data = [
    tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    for conversation in train_final_prompts
]

In [None]:
# Show the first rendered training prompt; print() displays the embedded newlines as line breaks.
print(train_data[0])

<s>[INST] Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.
Positive example -
input: I charge it at night and skip taking the cord with me because of the good battery life.
output: battery life:positive, 
Neutral example -
input: Nightly my computer defrags itself and runs a virus scan.
output: virus scan:neutral
Now complete the following example-
input:I charge it at night and skip taking the cord with me because of the good battery life. [/INST] output: cord:neutral, battery life:positive </s>


In [None]:
# Render every test conversation to a single prompt string: text only
# (tokenize=False) and without a trailing generation prompt.
test_data = [
    tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    for conversation in test_final_prompts
]

In [None]:
# Show the first rendered test prompt; print() displays the embedded newlines as line breaks.
print(test_data[0])

<s>[INST] Identify the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.
Positive example -
input: I charge it at night and skip taking the cord with me because of the good battery life.
output: battery life:positive, 
Neutral example -
input: Nightly my computer defrags itself and runs a virus scan.
output: virus scan:neutral
Now complete the following example-
input:Boot time is super fast, around anywhere from 35 seconds to 1 minute. [/INST] output: Boot time:positive </s>


In [None]:
# Number of rendered prompts in each split, as (train, test).
(len(train_data), len(test_data))

(3045, 800)

In [None]:
# Write each split's rendered prompts to its own CSV on Drive, using pandas'
# default index and header settings.
for prompts, csv_path in (
    (train_data, '/content/drive/MyDrive/Fine tuning Llama/train_prompts.csv'),
    (test_data, '/content/drive/MyDrive/Fine tuning Llama/test_prompts.csv'),
):
    pd.DataFrame(prompts).to_csv(csv_path)