In [None]:
%%time
!pip install transformers datasets accelerate peft
#fsspec==2024.10.0
!pip install sympy==1.13.3
#pip install tiktoken
#pip install triton


In [None]:

#load model
%%time
#https://huggingface.co/microsoft/Phi-3.5-mini-instruct
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/Phi-3.5-mini-instruct"
#"microsoft/Phi-3-small-8k-instruct" #8B
#"microsoft/Phi-3.5-mini-instruct" #"microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:

# Correct some special tokens
# Reuse the end-of-sequence token as the padding token so padding has a valid id.
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token value for padding

# '<|eot_id|>' is the Llama-3 end-of-turn marker and is not part of every
# model's vocabulary. Assigning an unknown string as eos_token would leave the
# tokenizer with an EOS that maps to no real token id, so only override EOS
# when the loaded tokenizer actually contains that token.
if '<|eot_id|>' in tokenizer.get_vocab():
    tokenizer.eos_token = '<|eot_id|>'  # Set the end of sequence token

#set cuda for GPU
import torch
# Only make CUDA the default device when a GPU is present; on a CPU-only
# runtime the default device is left untouched instead of pointing at a
# device that does not exist.
if torch.cuda.is_available():
    torch.set_default_device("cuda")


In [None]:
#prepare data
import json
import csv
import pandas as pd


def _load_posts(path):
    """Load a JSON file that holds a list of post records into a DataFrame.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A DataFrame built from the list of records.

    Raises:
        ValueError: If the top-level JSON value is not a list. Failing here
            gives a clear message instead of a NameError further down when
            the DataFrame was silently never created.
    """
    with open(path, 'r') as json_file:
        records = json.load(json_file)
    if not isinstance(records, list):
        raise ValueError(
            f"{path}: expected a JSON list of records, got {type(records).__name__}"
        )
    return pd.DataFrame(records)


df_no = _load_posts('no_burnout.json')
df_yes = _load_posts('clean_burnout.json')

# Tag every row with the class of the file it came from.
df_no['label'] = 'nonburnedout'
df_yes['label'] = 'burnedout'

#combine and shuffle
# sample(frac=1) returns all rows in random order; random_state=42 makes the
# shuffle reproducible, and reset_index gives the result a fresh 0..n-1 index.
df_all = pd.concat([df_no, df_yes]).sample(frac=1, random_state=42, replace=False).reset_index(drop=True)

In [None]:
# SMOTE oversampling -- currently disabled: everything below is wrapped in a
# bare triple-quoted string, so none of it is executed.
# NOTE(review): X is the 'body' column taken straight from df_all; if that
# column holds raw text it would presumably have to be turned into numeric
# features before SMOTE could resample it -- confirm before re-enabling.
"""
from collections import Counter
from imblearn.over_sampling import SMOTE

X = df_all['body']
y = df_all['label']
print('Original dataset shape %s' % Counter(y))
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)
print('Smote dataset shape %s' % Counter(y))
"""

In [None]:

#pipeline for model
%%time
from transformers import pipeline
import time

start_time = time.time()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer = tokenizer,
    max_new_tokens= 100,
    #max_length = 1000,
    truncation = True,
    #padding = True,
    #temperature = 0.1, #not needed b/c do_sample=False meaning it's deterministic
    #top_k = 50,
    #top_p = 0.95,
    do_sample=False,
    device_map="auto",
    trust_remote_code=True,
)
end_time = time.time()
print("Elapsed time:", end_time - start_time, "seconds")

In [None]:
import re

def clean_text(text):
  """Extract the predicted label from a piece of generated text.

  Finds every ``'output': <value>`` occurrence, keeps the last one, and strips
  everything except ASCII letters from the remainder of that line, so
  ``'burned out'}`` becomes ``burnedout``.

  Args:
    text: The text to search.

  Returns:
    The letters-only label taken from the last match, or the string
    "Not found" when the text contains no ``'output': `` marker.
  """
  # `.` never matches a newline, so the greedy group runs to the end of the
  # line -- or to the end of the string when the answer sits on the final line
  # without a trailing newline. (Requiring a literal '\n' after the value
  # would miss exactly that final-line case.)
  pattern = r"'output': (.*)"

  # Find all occurrences of the pattern
  matches = re.findall(pattern, text)
  if not matches:
    return "Not found"

  # Use the last occurrence and keep only its letters.
  return re.sub(r'[^a-zA-Z]', '', matches[-1])
  

In [None]:

%%time

all_responses_original = []
all_responses = []


from tqdm import tqdm

for index, post in tqdm(df_all.iterrows(), total=df_all.shape[0]):
  prompt = f"""
  The input is a post a person shared on the social media.
  The output is a binary classification of this person being 'burned out' or 'non burned out'.
  The task is provide the output based on the input.
  Please provide the output with only the following based on the input text, and DO NOT iterate the input or give additional text beyond the output in your response:
    - {{'output': 'burned out' or 'non burned out'}}

  Here is the input: {post['title'] + " " + post['body']}
  """
    #inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
  output = pipe(prompt)
  response = clean_text(output[0]['generated_text'])
  all_responses.append(response)
  #not enough memory to save all these for long runs
  all_responses_original.append(output[0]['generated_text'])

In [None]:
# Attach the parsed labels and the raw generations to the posts, then write
# the combined table to a CSV file in the runtime's working directory.
df_x = df_all  # same object as df_all (not a copy): the new columns appear on df_all too
for column_name, column_values in (
    ('predictions', all_responses),
    ('full generated text', all_responses_original),
):
    df_x[column_name] = column_values
df_x
df_x.to_csv('pred_all_wTitle.csv', index=False)

# The triple-quoted block below is an inert string kept for reference; it is not executed.
""" If running the two dataframes separately
with open('pred_no_burnout_clean.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write each item from the list as a row
    for item in all_responses:
        writer.writerow([item])  # Write the item as a row (wrapped in a list)

with open('pred_no_burnout_original.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write each item from the list as a row
    for item in all_responses_original:
        writer.writerow([item])  # Write the item as a row (wrapped in a list)

"""
print("output SAVED results to runtime, REMEMBER TO DOWNLOAD TO LOCAL MACHINE")

### SAVED results to runtime, REMEMBER TO DOWNLOAD TO LOCAL MACHINE