In [None]:
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install torch -q
!pip install datasets -q


!pip install --upgrade transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd

# Load the climate data
# The file name is a relative path, so the CSV must sit in the working directory.
df = pd.read_csv('climate_change_faqs.csv')

In [None]:
# Print the df to see if it is loaded well
# (head() shows only the first few rows, not the whole frame)
df.head()

Unnamed: 0,source,faq,text_type
0,https://www.ipcc.ch/site/assets/uploads/2020/0...,If Understanding of the Climate System Has Inc...,q
1,https://www.ipcc.ch/site/assets/uploads/2020/0...,The models used to calculate the IPCC’s temper...,a
2,https://www.ipcc.ch/site/assets/uploads/2020/0...,How Do We Know the World Has Warmed?,q
3,https://www.ipcc.ch/site/assets/uploads/2020/0...,Evidence for a warming world comes from multip...,a
4,https://www.ipcc.ch/site/assets/uploads/2020/0...,Have There Been Any Changes in Climate Extremes?,q


# Preprocessing

In [None]:
# Keep only the required columns, which are faq and text_type.
# Selecting by name (instead of dropping 'source' in place) keeps this cell
# safe to re-run: a second in-place drop raises KeyError because the column
# is already gone. It also fixes the column order the next cell relies on.

df = df[['faq', 'text_type']]

df.head()

Unnamed: 0,faq,text_type
0,If Understanding of the Climate System Has Inc...,q
1,The models used to calculate the IPCC’s temper...,a
2,How Do We Know the World Has Warmed?,q
3,Evidence for a warming world comes from multip...,a
4,Have There Been Any Changes in Climate Extremes?,q


In [None]:
# Separate the questions and answers
total = len(df)

questions = []
answers = []

# Read the two columns by name. Integer keys such as row[0] on a
# label-indexed row are a deprecated positional fallback in pandas.
for text, text_type in zip(df['faq'], df['text_type']):
  # Lower-case the text and flatten embedded newlines so every entry is a
  # single line.
  cleaned = text.lower().replace('\n', ' ')
  if text_type == 'q':
    questions.append(cleaned)
  else:
    answers.append(cleaned)

# Questions and answers are paired purely by order (i-th question with i-th
# answer), so the two lists must be the same length. Fail with a clear
# message instead of the generic length error from the DataFrame constructor.
if len(questions) != len(answers):
  raise ValueError(
      f'Expected one answer per question, got {len(questions)} questions '
      f'and {len(answers)} answers out of {total} rows.')

new_df = pd.DataFrame({
    'question': questions,
    'answer': answers
})

In [None]:
# Print the normalized data
# (new_df has one 'question' column and one 'answer' column)

new_df.head()

Unnamed: 0,question,answer
0,if understanding of the climate system has inc...,the models used to calculate the ipcc’s temper...
1,how do we know the world has warmed?,evidence for a warming world comes from multip...
2,have there been any changes in climate extremes?,there is strong evidence that warming has lead...
3,is the ocean warming?,"yes, the ocean is warming over many regions, d..."
4,is there evidence for changes in the earth’s w...,the earth’s water cycle involves evaporation a...


In [None]:
import unicodedata
def remove_accents(input_str):
    """Return ``input_str`` with combining marks stripped.

    The text is first decomposed with Unicode NFKD normalization; every
    character for which ``unicodedata.combining`` reports a non-zero class
    is then dropped, and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return ''.join(kept_chars)

# Strip accents from every answer, keeping the original row order.
normal = [remove_accents(answer_text) for answer_text in new_df['answer']]

In [None]:
# Overwrite the answer column with the accent-stripped strings collected in `normal`.
new_df['answer'] = normal

# Prepare data for the model

In [None]:
# Render every question/answer pair as one training example: a "[Q]: ..."
# part followed, after a newline, by an "[A]: ..." part.

question_part = '[Q]: ' + new_df['question']
answer_part = '\n[A]: ' + new_df['answer']
normalized_df = question_part + answer_part

normalized_df.head()

0    [Q]: if understanding of the climate system ha...
1    [Q]: how do we know the world has warmed?\n[A]...
2    [Q]: have there been any changes in climate ex...
3    [Q]: is the ocean warming?\n[A]: yes, the ocea...
4    [Q]: is there evidence for changes in the eart...
dtype: object

In [None]:
# Write the training examples as plain text, one record after another.
# to_csv(sep='\n') would pass each record through the CSV writer; because
# every record contains a newline (between its [Q] and [A] parts), the writer
# wraps it in double quotes and doubles any embedded quotes, which leaks CSV
# syntax into the text the model is trained on.
with open('climate_train.txt', 'w', encoding='utf-8') as train_file:
  train_file.write('\n'.join(normalized_df) + '\n')

# Load Model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch

# Paths and model identifier used throughout this cell.
train_dataset_path = 'climate_train.txt'
model_name = "gpt2"
model_output_path = 'model_output/'

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load training dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_dataset_path,
    block_size=128)


# Create the data collator for language modeling. It is built exactly once;
# a second, identical construction would only rebind the same name.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 is an autoregressive model, not masked
)

# Set training arguments
training_args = TrainingArguments(
    output_dir=model_output_path,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=5_000,
    save_total_limit=10,
)

# Build the trainer; training itself is started in the next cell.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Start training with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss


In [None]:
# Questions to feed to the model; each entry becomes one generation prompt.
new_text = ['What is Climate Change?']

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = model.to(device)
# Switch to inference mode so dropout is disabled while sampling.
model.eval()


# One tensor of generated token ids per input question.
predicted_text_ids = []

for conversation in new_text:
  # Mirror the training layout ('[Q]: <question>\n[A]: <answer>'): keep the
  # space after '[Q]:' and end the prompt right after '[A]:' so the model
  # produces the answer's leading space itself instead of seeing it twice.
  model_input = '[Q]: ' + conversation.lower() + '\n[A]:'
  # Calling the tokenizer returns the attention mask along with the input ids,
  # so generate() does not have to guess which positions are padding.
  encoded = tokenizer(model_input, return_tensors='pt').to(device)
  predicted_text_ids.append(
      model.generate(
        **encoded,
        do_sample=True,
        max_length=128,  # total length, prompt tokens included
        pad_token_id=model.config.eos_token_id,
        top_k=5,
        top_p=0.97
    )
  )

In [None]:
# Decode the first sequence of every generation result back into text,
# leaving out special tokens.
generated_texts = [
    tokenizer.decode(output_ids[0], skip_special_tokens=True)
    for output_ids in predicted_text_ids
]

# Show each decoded answer on its own line.
for decoded in generated_texts:
  print(decoded)

[Q]:what is climate change?
[A]:   global average temperature rise is based on a number of factors – from a number of factors – such as changes in precipitation, land use change, ocean acidification and land-mass change. however, the most important factor is the overall rate of change in the rate of temperature change.  the rate of change in temperature varies across countries and regions. for example, the rate of change of the earth’s temperature rise depends on many other factors, including the rate and extent of ocean acidification.  the most important climate variables in the global record are those that influence


In [None]:
# Save the fine-tuned model
# The model and the tokenizer are written to two separate directories.
model.save_pretrained('fine-tuned/model/')
tokenizer.save_pretrained('fine-tuned/tokenizer/')

('fine-tuned/tokenizer/tokenizer_config.json',
 'fine-tuned/tokenizer/special_tokens_map.json',
 'fine-tuned/tokenizer/vocab.json',
 'fine-tuned/tokenizer/merges.txt',
 'fine-tuned/tokenizer/added_tokens.json')