In [1]:
!pip install transformers datasets accelerate peft bitsandbytes

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import torch
torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", force_download=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", force_download=True)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [28]:
import json
import pandas as pd
import re
import os
from datasets import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset

# English to Esgish Data
train_excel_path = "translated_queries_TEST4.xlsx"

train_df = pd.read_excel(train_excel_path)
expected_columns = ["Esgish", "English"]
train_df.rename(columns=lambda x: x.strip(), inplace=True)

for col in expected_columns:
    assert col in train_df.columns, f"Missing expected column: {col}"
print("Training data loaded successfully.")
print(train_df.head())

train_data = []

for _, row in train_df.iterrows():
    train_data.append({
        "input": f"Convert to Esgish, a query language: {row['English']}",
        "output": row["Esgish"]
    })

train_df, temp_df = train_test_split(train_df, test_size=0.2, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))


def preprocess_function(examples):
    inputs = tokenizer(examples["English"], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples["Esgish"], max_length=128, truncation=True, padding="max_length")    
    model_inputs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_ids"], 
    }
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["English", "Esgish"])
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=["English", "Esgish"])
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["English", "Esgish"])
print(train_dataset[0])

Training data loaded successfully.
                                           Esgish  \
0      [AbortifacientsDistrMaxRevRatio1Y] < '0.1'   
1     [AbortifacientsDistrMaxRevRatio1Y] > '0.01'   
2        [AbortifacientsDistrMaxRevRatio3Y] > '0'   
3  [AbortifacientsInvolvement] ANY 'Distribution'   
4    [AbortifacientsInvolvement] ANY 'Production'   

                                             English  
0  All companies or issuers where the maximum sha...  
1  All companies or issuers where the maximum sha...  
2  All companies that have a maximum share of rev...  
3  All companies or issuers that are involved in ...  
4  All companies or issuers that produce abortifa...  


Map:   0%|          | 0/388 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

{'input_ids': [863, 370, 3, 9, 570, 13, 66, 688, 42, 962, 52, 7, 24, 942, 8, 826, 6683, 10, 1300, 20534, 53, 2336, 3, 18, 2570, 115, 5, 21489, 1575, 41, 6210, 10, 37, 349, 65, 3, 9, 1516, 3134, 6275, 13, 44, 709, 3, 18, 4704, 12, 8, 10970, 13, 8, 5997, 96, 4302, 3473, 53, 387, 1280, 1682, 20534, 53, 2336, 3, 18, 2570, 115, 5, 21489, 1575, 41, 3174, 26, 5, 61, 10, 37, 349, 704, 494, 42, 364, 24, 43, 3, 9, 1516, 3134, 6275, 13, 44, 709, 3, 18, 4704, 12, 8, 10970, 13, 8, 5997, 96, 4302, 3473, 53, 387, 1280, 1877, 20534, 53, 2336, 3, 18, 2570, 115, 5, 4249, 7593, 15, 26, 41, 6210, 10, 37, 349, 65, 3, 9, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [40]:
#Esgish to English Data
train_excel_path = "translated_queries_TEST4.xlsx"

train_df = pd.read_excel(train_excel_path)
expected_columns = ["Esgish", "English"]
train_df.rename(columns=lambda x: x.strip(), inplace=True)

for col in expected_columns:
    assert col in train_df.columns, f"Missing expected column: {col}"
print("Training data loaded successfully.")
print(train_df.head())

train_data = []

for _, row in train_df.iterrows():
    train_data.append({
        "input": f"Convert to English: {row['Esgish']}",
        "output": row["English"]
    })

train_df, temp_df = train_test_split(train_df, test_size=0.2, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))


def preprocess_function(examples):
    inputs = tokenizer(examples["Esgish"], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")    
    model_inputs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_ids"], 
    }
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["Esgish", "English"])
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=["Esgish", "English"])
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["Esgish", "English"])
print(train_dataset[0])

Training data loaded successfully.
                                           Esgish  \
0      [AbortifacientsDistrMaxRevRatio1Y] < '0.1'   
1     [AbortifacientsDistrMaxRevRatio1Y] > '0.01'   
2        [AbortifacientsDistrMaxRevRatio3Y] > '0'   
3  [AbortifacientsInvolvement] ANY 'Distribution'   
4    [AbortifacientsInvolvement] ANY 'Production'   

                                             English  
0  All companies or issuers where the maximum sha...  
1  All companies or issuers where the maximum sha...  
2  All companies that have a maximum share of rev...  
3  All companies or issuers that are involved in ...  
4  All companies or issuers that produce abortifa...  


Map:   0%|          | 0/388 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

{'input_ids': [4674, 599, 6306, 134, 302, 5231, 40, 28632, 12988, 3728, 5890, 115, 4302, 17, 908, 2490, 2423, 3, 31, 2292, 31, 6, 6306, 134, 302, 5231, 40, 28632, 3174, 26, 5890, 115, 4302, 17, 908, 206, 195, 3, 31, 9, 31, 6, 6306, 134, 302, 5231, 40, 28632, 3174, 26, 5890, 115, 4302, 17, 908, 206, 195, 3, 31, 15, 31, 6, 6306, 134, 302, 5231, 40, 28632, 12988, 3728, 5890, 115, 667, 115, 7593, 908, 2490, 2423, 3, 31, 2292, 31, 6, 6306, 134, 302, 5231, 40, 28632, 3174, 26, 5890, 115, 667, 115, 7593, 908, 206, 195, 3, 31, 15, 31, 6, 6306, 134, 302, 5231, 40, 28632, 3174, 26, 5890, 115, 667, 115, 7593, 908, 206, 195, 3, 31, 9, 31, 6, 6306, 134, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [41]:
!pip install accelerate -U
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, T5Tokenizer

training_args = TrainingArguments(
    output_dir="new_model",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="none",
    remove_unused_columns=False,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class = tokenizer,
    data_collator=data_collator,

)

print("Training started...")

trainer.train()
print("training done")


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Training started...


Epoch,Training Loss,Validation Loss
1,No log,0.670527
2,No log,0.616724
3,No log,0.602195


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


training done


In [42]:
model.save_pretrained("./new_model2")
tokenizer.save_pretrained("./new_model2")

('./new_model2\\tokenizer_config.json',
 './new_model2\\special_tokens_map.json',
 './new_model2\\spiece.model',
 './new_model2\\added_tokens.json',
 './new_model2\\tokenizer.json')

In [22]:
#cleanup
del model
del tokenizer

import gc
gc.collect()

85

In [43]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("./new_model2")
tokenizer = AutoTokenizer.from_pretrained("./new_model2")

In [49]:
import pandas as pd

#translate random queries
excel = pd.read_excel("translated_queries_TEST3_50q.xlsx")
sample = excel[["Esgish", "English"]].dropna().sample(5).values

for esg, eng in sample:
    input_text = "Convert to Esgish, a query language: " + eng
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    output_ids = model.generate(
        **inputs,
        max_length=1024,
        num_beams=4,
        early_stopping=True
    )
    print("Query: ", eng)
    print("Correct: ", esg)
    print("Model: ", tokenizer.decode(output_ids[0], skip_special_tokens=True), "\n\n")

Query:  All companies or issuers listed on the AFL-CIO Boycott List.
Correct:  [AFLCIOBoycottList] True
Model:  All companies or issuers listed on the AFL-CIO Boycott List. 


Query:  All companies or issuers that have an alcohol revenue share interval between 10% and 15%, or between 15% and 20%, or between 20% and 25%, or between 25% and 50%, or between 5% and 10%, or between 50% and 100%.
Correct:  [AlcoholRevShareInterval] IN '[10-15%)|[15-20%)|[20-25%)|[25-50%)|[5-10%)|[50-100%]'
Model:  All companies or issuers that have an alcohol revenue share interval between 10% and 15%, between 15% and 20%, between 20% and 25%, between 25% and 50%, or between 5% and 10%, or between 50% and 100%. 


Query:  All companies with a minimum alcohol distribution percentage of revenue greater than 50% in the latest financial year.
Correct:  [AlcoholDistributionMinRev] > '0.5'
Model:  All companies with a minimum alcohol distribution percentage of revenue greater than 50% in the latest financial year.

In [46]:
#translate esgish queries
excel = pd.read_excel("translated_queries_TEST4.xlsx")
sample = excel[["Esgish", "English"]].dropna().sample(5).values

for esg, eng in sample:
    input_text = "Convert to English: " + esg
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    output_ids = model.generate(
        **inputs,
        max_length=1024,
        num_beams=4,
        early_stopping=True
    )
    print("Query: ", esg)
    print("Correct: ", eng)
    print("Model: ", tokenizer.decode(output_ids[0], skip_special_tokens=True), "\n\n")

Query:  AND([FossilFuelInvolvement] ANY 'Production|Exploration',[FossilFuelRevShareMin] > '0')
Correct:  All companies or issuers that are involved in the production or exploration of fossil fuels and have a minimum percentage of revenue derived from fossil fuel business activities greater than 0.
Model:  Please provide a list of all companies or issuers that are involved in the production or exploration of fossil fuels and have a minimum percentage of revenue derived from fossil fuel involvement greater than 0. 


Query:  OR([TobaccoDistMaxRev] > '0.05',[TobaccoProdMaxRev] > '0',[TobaccoServMaxRev] > '0.05',[AlcoholServiceMaxRev] > '0.05',[AlcoholTotalProdMaxRev] == '0.05',[AlcoholDistributionMaxRev] > '0.05')
Correct:  Please provide a list of companies or issuers that have a tobacco-related revenue share of more than 5% or an alcohol-related revenue share of more than 5%, or a tobacco-related revenue share of more than 5% and an alcohol-related revenue share of more than 5%, or a t

In [48]:
excel = pd.read_excel("C:/Users/Jason Chen/Desktop/OU/OU24-25/CS4273/Esgish2_edited.xlsx")
sample = excel["Esgish"].dropna().sample(5).values

for esg in sample:
    input_text = "Convert to English: " + esg
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    output_ids = model.generate(
        **inputs,
        max_length=1024,
        num_beams=4,
        early_stopping=True
    )
    print("Query: ", esg)
    print("Model: ", tokenizer.decode(output_ids[0], skip_special_tokens=True), "\n\n")

Query:  OR([NBSOverallFlag] == 'RED',[CoalMiningRevShareMaxThermal] > '0.05',[PowGenRevShareCoalMax] > '0.05',[TobaccoProdMaxRev] > '0.05',[TobaccoDistMaxRev] > '0.05',[PornographyDistMaxRev] > '0',[PornographyProdMaxRev] > '0',[GamblingDistMaxRev] >= '0.05',[GamblingProdMaxRev] >= '0.05',[StemCellCloning] True,[APMinesOverallFlag] == 'RED',[BiologicalWeaponsOverallFlag] == 'RED',[ChemicalWeaponsOverallFlag] == 'RED',[ClusterMunitionsOverallFlag] == 'RED',[DepletedUraniumOverallFlag] == 'RED',[NuclearWeaponsOverallFlag] == 'RED',[NuclearWeaponsNonNPTOverallFlag] == 'RED',[WhitePhosphorusOverallFlag] == 'RED',[issuerID] == '0')
Model:  Please provide a list of companies or issuers that meet the following criteria: 1. NBS Overall Flag: Red 2. Coal Mining Revenue Share Max Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage of Revenues Percentage o