In [2]:
import os

# Make the repository root the working directory, so the `src.*` imports and the
# relative data paths used below resolve wherever the notebook was started from.
current_dir = os.path.basename(os.getcwd())
if current_dir != "bai-thesis-nlp":  # already at the root: stay there
    # One level up when started from `src`, two levels up from anywhere else.
    os.chdir(".." if current_dir == "src" else "../..")
    
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import re
from src._utils._helpers import log_synthetic_data, response2json, get_response, set_seed, clear_cuda_cache

In [3]:
# Folder that receives the synthetic-data logs (created if it does not exist yet)
folder_name = "synthetic_data/logs"
os.makedirs(folder_name, exist_ok=True)

# File where the logs will be saved
log_file_path = f"{folder_name}/agnews_log.json"

# Set to True to delete an existing log file and recreate it from scratch
RECREATE_LOG = False
if RECREATE_LOG and os.path.exists(log_file_path):
    os.remove(log_file_path)

# DEVICE
device = "cuda:0"

# DATA
# Read the training CSV and rename columns "2" / "3" to "text" / "label".
df = pd.read_csv("real_data/train/agnewstrainAll.csv").rename(
    columns={"2": "text", "3": "label"}
)
display(df.head())

# Distinct label values, plus two string renderings of them:
# a comma-separated line and a markdown bullet list.
labels_lst = df["label"].unique()
labels_str = ", ".join(labels_lst)
labels_str_bullet = "\n".join(f"- **{name}**" for name in labels_lst)
print(f"Labels: {labels_str}")

# Collect the first text of every label, then print each label/text pair
examples = [
    df.loc[df["label"] == lbl, "text"].iloc[0]  # first row carrying this label
    for lbl in labels_lst
]
for label, example in zip(labels_lst, examples):
    print(f"Label: {label}\nText: {example}\n")

Unnamed: 0,0,1,text,label
0,1000_train,Hartford executive #39;s stock sale probed,New York AG is investigating the timing of a s...,Business
1,1001_train,Cool Batman Photo!,"In related news, it was announced yesterday th...",Sci/Tech
2,1002_train,Stocky Monkey in Himalayas Becomes Newest Prim...,Scientists from India working in the Himalayas...,Sci/Tech
3,1003_train,"Bush, Kerry Don''t Worry About Tech","Election Day is less than two weeks away, but ...",Sci/Tech
4,1004_train,The Teeming Crowd in Video Games,"Jeffrey Griffiths, the president and chief exe...",Sci/Tech


Labels: Business, Sci/Tech, Sports, World
Label: Business
Text: New York AG is investigating the timing of a sale by Thomas Marra; company reports higher 3Q. NEW YORK (Reuters) - The Hartford Financial Services Group Inc.

Label: Sci/Tech
Text: In related news, it was announced yesterday that Warner Bros. Interactive Entertainment, DC Comics and Electronic Arts will bring a Batman Begins videogame tie-in.

Label: Sports
Text: Lindsay Davenport #39;s world number one ranking is in doubt after she failed to make the finals of the WTA Tour Championships in Los Angeles.

Label: World
Text:  JERUSALEM (Reuters) - Israeli Prime Minister Ariel Sharon  accused far-rightists Sunday of trying to incite civil war over  his plan to withdraw from the occupied Gaza Strip and called  for measures to curb such groups.



---

---

In [4]:
from src._utils._generate_dataset import generate_synthetic_data

In [25]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# 4-bit quantization settings handed to `from_pretrained`
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # Place the model on the `device` chosen in the setup cell instead of a
    # second hard-coded "cuda" string, so the device is configured in one place.
    device_map=device,
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,  # load in 4-bit quantization
    # further model parameters can be added here
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the tokenizer's pad token for generation; if the tokenizer defines none,
# fall back to its EOS token so `pad_token_id` is not left as None.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = pad_token_id

print(model.generation_config.to_dict())

{'max_length': 20, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'early_stopping': False, 'max_time': None, 'stop_strings': None, 'do_sample': True, 'num_beams': 1, 'num_beam_groups': 1, 'penalty_alpha': None, 'dola_layers': None, 'use_cache': True, 'cache_implementation': None, 'cache_config': None, 'return_legacy_cache': None, 'temperature': 0.6, 'top_k': 50, 'top_p': 0.95, 'min_p': None, 'typical_p': 1.0, 'epsilon_cutoff': 0.0, 'eta_cutoff': 0.0, 'diversity_penalty': 0.0, 'repetition_penalty': 1.0, 'encoder_repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'bad_words_ids': None, 'force_words_ids': None, 'renormalize_logits': False, 'constraints': None, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'forced_decoder_ids': None, 'sequence_bias': None, 'token_healing': False, 'guidance_scale': None, 'low

In [26]:
# Print the `to_diff_dict()` view of the model's quantization config, or a notice
# when the model config has no `quantization_config` attribute.
has_quantization = hasattr(model.config, "quantization_config")
print(
    model.config.quantization_config.to_diff_dict()
    if has_quantization
    else "Model does not have quantization_config"
)


{'_load_in_4bit': True, 'load_in_4bit': True}


---

In [None]:
# `res` holds the quantization config's `to_diff_dict()` result, or None when the
# model config has no `quantization_config` attribute.
if hasattr(model.config, "quantization_config"):
    res = model.config.quantization_config.to_diff_dict()
else:
    res = None
print(res)

{'_load_in_4bit': True, 'load_in_4bit': True}
