In [1]:
!pip install openpyxl
import pandas as pd

df = pd.read_excel('cleaned_evaluation_keyphrases_200US.xlsx')

reviews = df.iloc[0:, 0].tolist()

reviews[0]



'Love Kallax. These are the best storage cubbies. I love them. Have them in my closets with the Drona boxes. My husband has some in his train room. They are a great staple to have.'

In [2]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /var/tmp/pip-req-build-wxw7n6ys
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /var/tmp/pip-req-build-wxw7n6ys
  Resolved https://github.com/huggingface/transformers to commit 7c71b61daef4da1ec5dc41674718d895176af3be
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [3]:
!pip install accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)


base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
from peft import PeftModel

model = PeftModel.from_pretrained(base_model, "mistral-new-finetune-400/checkpoint-1000")

In [5]:
import time

start_time = time.time()
device = "cuda"
answers = []
pre_text = 500 + 1221
eos_token_id = tokenizer.convert_tokens_to_ids('.')
eos_token_id_additional = tokenizer.convert_tokens_to_ids('<')

for review in reviews: 
    review_len = len(review)
    
    messages = [
        {"role": "user", "content": "From this sentence: 'So soft. This mat is amazing, it’s so so soft to stand on, we are loving it' generate a keyphrase for each key characteristic of the product and classify the sentiment of each generated keyphrase between positive, negative or neutral. Don't give any explainations, the output should only be Keyphrase 1 - Sentiment 1, Keyphrase 2 - Sentiment 2, ..."},
        {"role": "assistant", "content": "Soft to stand on - Positive, amazing mat - Positive"},
        {"role": "user", "content": "From this sentence: 'Kallax. Nice clean look, sturdy and easy to put together' generate a keyphrase for each key characteristic of the product and classify the sentiment of each generated keyphrase between positive, negative or neutral. Don't give any explainations, the output should only be Keyphrase 1 - Sentiment 1, Keyphrase 2 - Sentiment 2, ..."},
        {"role": "assistant", "content": "Clean look - Positive, sturdy - Positive, easy to assemble - Positive"},
        {"role": "user", "content": "From this sentence: 'Great bookshelves. We bought 4 of these and made a library wall with them. Looks great and holds lots of books.' generate a keyphrase for each key characteristic of the product and classify the sentiment of each generated keyphrase between positive, negative or neutral. Don't give any explainations, the output should only be Keyphrase 1 - Sentiment 1, Keyphrase 2 - Sentiment 2, ..."},
        {"role": "assistant", "content": "Great bookshelves - Positive, Looks great - Positive, made library wall - Neutral, capacious - Positive."},
        {"role": "user", "content": f"From this sentence: '{review}' generate a keyphrase for each key characteristic of the product and classify the sentiment of each generated keyphrase between positive, negative or neutral. Don't give any explainations, the output should only be Keyphrase 1 - Sentiment 1, Keyphrase 2 - Sentiment 2, ..."},
    ]
    
    
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to(device)

    generated_ids = model.generate(model_inputs, max_new_tokens=75, pad_token_id=eos_token_id, eos_token_id=eos_token_id, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    
    
    answer = decoded[0][pre_text+review_len:]
    
    #if eos_token_id_additional != -1:
    #    answer = answer.split('<')[0]
        
    answers.append(answer)

    if (answers.index(answer) + 1) % 20 == 0: 
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"{answers.index(answer) + 1}th file completed in {elapsed_time} seconds.")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


20th file completed in 187.16180872917175 seconds.
40th file completed in 344.5018811225891 seconds.
60th file completed in 489.290239572525 seconds.
80th file completed in 668.565390586853 seconds.
100th file completed in 849.5190889835358 seconds.
120th file completed in 996.661926984787 seconds.
140th file completed in 1145.7005922794342 seconds.
160th file completed in 1364.4608039855957 seconds.
180th file completed in 1527.9451723098755 seconds.
200th file completed in 1685.4833595752716 seconds.


In [6]:
reviews = [review.replace('\n', ' ') for review in reviews]

In [7]:
import csv

filename = 'Mistral-finetuned.csv'

# Writing to the csv file
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    # Creating a csv writer object with quoting set to quote all fields
    csvwriter = csv.writer(csvfile, delimiter=';')
    
    # Writing the columns
    csvwriter.writerow(['Reviews', 'Keyphrases'])
    
    # Writing the data
    for review, answer in zip(reviews, answers):
        csvwriter.writerow([review, answer])

print(f'CSV file "{filename}" has been written successfully.')

CSV file "Mistral-finetuned.csv" has been written successfully.
