In [None]:
# Cell 1: Install required packages
# NOTE: running these installs in a notebook may require a restart for some packages.
!pip install -q pandas requests datasets transformers accelerate peft bitsandbytes sentencepiece python-dotenv

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q --upgrade torch torchvision torchaudio
!pip install -q --upgrade transformers accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Cell 2: Imports
import io
import json
import os
import re

import pandas as pd
import requests
from datasets import Dataset, load_dataset
from dotenv import load_dotenv

In [None]:
# Hugging Face authentication.
# SECURITY: never hardcode an access token in a notebook -- it leaks through version
# control and shared copies. The token is read from the environment instead
# (optionally populated from a local, untracked `.env` file). Any token that was
# previously committed in this cell must be treated as compromised and revoked at
# https://huggingface.co/settings/tokens.
load_dotenv()  # loads HF_TOKEN from a .env file if one is present; no-op otherwise
# If HF_TOKEN is unset this is None; presumably transformers then falls back to
# credentials cached by `huggingface-cli login` -- verify for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")

from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Cell 3: Scrape the Worldometer page and parse the table
url = "https://www.worldometers.info/co2-emissions/nigeria-co2-emissions/"
# A timeout keeps the cell from hanging forever if the site never answers.
resp = requests.get(url, headers={"User-Agent": "python-requests"}, timeout=30)
resp.raise_for_status()
# pandas.read_html returns one DataFrame per <table> found in the page.
# The HTML text is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
tables = pd.read_html(io.StringIO(resp.text))

# Inspect tables to find the one with 'Year' and 'Fossil CO2 emissions' or similar column names
for i, t in enumerate(tables):
    print(i, t.columns.tolist(), t.shape)

0 ['Unnamed: 0', 'Fossil CO2 emissions (tons)', 'CO2 emissions change', 'CO2 emissions per capita', 'Population', 'Pop. change', "Share of World's CO2 emissions"] (46, 7)


  tables = pd.read_html(resp.text)


In [None]:
# Cell 4: Pick the parsed table to work with (change table_index if the page layout differs)
table_index = 0
# Copy the chosen table so later edits do not touch the entry held in `tables`.
selected_table = tables[table_index]
df = selected_table.copy()

# Show the raw column labels before any renaming
print(df.columns)

Index(['Unnamed: 0', 'Fossil CO2 emissions (tons)', 'CO2 emissions change',
       'CO2 emissions per capita', 'Population', 'Pop. change',
       'Share of World's CO2 emissions'],
      dtype='object')


In [None]:
# Build a mapping from each original column label to a simplified one:
# any parenthesised suffix such as " (tons)" is removed and whitespace trimmed.
cols = {}
for original_label in df.columns:
    simplified_label = re.sub(r"\s*\(.*\)", "", str(original_label))
    cols[original_label] = simplified_label.strip()
df.rename(columns=cols, inplace=True)
print("Normalized columns:", df.columns.tolist())

Normalized columns: ['Unnamed: 0', 'Fossil CO2 emissions', 'CO2 emissions change', 'CO2 emissions per capita', 'Population', 'Pop. change', "Share of World's CO2 emissions"]


In [None]:
# Collect every column whose label looks emissions-related.
# A label qualifies if it mentions "CO2" or "Fossil" (case-sensitive) or
# contains "emissions" in any letter case; adjust the tests for other tables.
possible_em_col_names = []
for col_name in df.columns:
    if "CO2" in col_name or "Fossil" in col_name or "emissions" in col_name.lower():
        possible_em_col_names.append(col_name)
print("Possible emission columns:", possible_em_col_names)

Possible emission columns: ['Fossil CO2 emissions', 'CO2 emissions change', 'CO2 emissions per capita', "Share of World's CO2 emissions"]


In [None]:
# Choose the first candidate as the emissions column.
# Fail with an explicit message (instead of a bare IndexError) when the scraped
# table contained no emissions-like column, e.g. after a page layout change.
if not possible_em_col_names:
    raise ValueError(
        "No emissions-like column found in the scraped table; "
        "inspect df.columns and set em_col manually."
    )
em_col = possible_em_col_names[0]
print("Using emissions column:", em_col)

# Clean numbers: remove commas, footnotes, percent signs, text
def clean_number(x):
    """Convert a scraped table cell into a number.

    Everything except digits, dots and minus signs is stripped from the string
    form of ``x`` before parsing.

    Args:
        x: Raw cell value (string, number, or a missing value).

    Returns:
        An ``int`` when the cleaned text has no decimal point, a ``float`` when
        it does, or ``None`` when the value is missing, empty after cleaning,
        or not parseable (e.g. ``"1.2.3"`` or a lone ``"-"``).
    """
    if pd.isna(x):
        return None
    # Map the typographic minus sign (U+2212) to an ASCII hyphen first; otherwise
    # the filter below would drop it and a negative value would become positive.
    s = str(x).replace("\u2212", "-")
    s = re.sub(r"[^\d\.\-]", "", s)  # keep digits, dot, minus
    if s == "":
        return None
    try:
        return float(s) if "." in s else int(s)
    except ValueError:
        # Only parsing failures are swallowed; a bare `except` would also hide
        # unrelated errors such as KeyboardInterrupt.
        return None

Using emissions column: Fossil CO2 emissions


In [None]:
# Pull the 4-digit year out of the unlabeled first column. Rows without a year
# become NaN here and are removed by dropna() below; the integer cast happens
# only afterwards, so such rows can no longer crash the cell.
df['year'] = pd.to_numeric(
    df['Unnamed: 0'].astype(str).str.extract(r"(\d{4})")[0],
    errors="coerce",
)
# Parse the emissions column into plain numbers (None for unparseable cells).
df['co2_tons'] = df[em_col].apply(clean_number)
# Keep only complete rows, restore an integer year, and order chronologically.
df = (
    df[['year', 'co2_tons']]
    .dropna()
    .astype({'year': int})
    .sort_values('year')
)
df.head(10)

Unnamed: 0,year,co2_tons
45,1977,66469720
44,1978,69536060
43,1979,99426960
42,1980,89601710
41,1981,73301610
40,1982,69883020
39,1983,69975610
38,1984,70293450
37,1985,73212450
36,1986,71536990


In [None]:
# Cell 5: Turn each (year, emissions) row into one question/answer training pair
records = []
for year_value, co2_value in zip(df['year'], df['co2_tons']):
    # Plain Python ints keep the f-string formatting and the record values simple.
    year, co2 = int(year_value), int(co2_value)
    instruction = f"What were Nigeria's fossil CO2 emissions (tons) in {year}?"
    output = f"In {year}, Nigeria's fossil CO2 emissions were {co2:,} tons."
    records.append(
        {
            "instruction": instruction,
            "output": output,
            "year": year,
            "co2_tons": co2,
        }
    )

# Tabular view of the generated pairs for a quick visual check
train_df = pd.DataFrame(records)
train_df.head(8)

Unnamed: 0,instruction,output,year,co2_tons
0,What were Nigeria's fossil CO2 emissions (tons...,"In 1977, Nigeria's fossil CO2 emissions were 6...",1977,66469720
1,What were Nigeria's fossil CO2 emissions (tons...,"In 1978, Nigeria's fossil CO2 emissions were 6...",1978,69536060
2,What were Nigeria's fossil CO2 emissions (tons...,"In 1979, Nigeria's fossil CO2 emissions were 9...",1979,99426960
3,What were Nigeria's fossil CO2 emissions (tons...,"In 1980, Nigeria's fossil CO2 emissions were 8...",1980,89601710
4,What were Nigeria's fossil CO2 emissions (tons...,"In 1981, Nigeria's fossil CO2 emissions were 7...",1981,73301610
5,What were Nigeria's fossil CO2 emissions (tons...,"In 1982, Nigeria's fossil CO2 emissions were 6...",1982,69883020
6,What were Nigeria's fossil CO2 emissions (tons...,"In 1983, Nigeria's fossil CO2 emissions were 6...",1983,69975610
7,What were Nigeria's fossil CO2 emissions (tons...,"In 1984, Nigeria's fossil CO2 emissions were 7...",1984,70293450


In [None]:
# Cell 6: Write the training pairs to a JSON Lines file for SFT
out_jsonl = "nigeria_co2_sft.jsonl"
# Trainers differ in the field names they expect ({"prompt", "completion"},
# {"input_text", "target_text"}, ...); this notebook uses "input_text" / "target_text".
# Each record becomes exactly one JSON object per line.
jsonl_lines = [
    json.dumps({"input_text": rec["instruction"], "target_text": rec["output"]}, ensure_ascii=False) + "\n"
    for rec in records
]
with open(out_jsonl, "w", encoding="utf-8") as f:
    f.writelines(jsonl_lines)

print("Saved", out_jsonl, "with", len(records), "examples")

Saved nigeria_co2_sft.jsonl with 46 examples


In [None]:
# Cell 7: Sanity check -- read the JSONL back as a Hugging Face dataset
ds = load_dataset(
    "json",
    data_files=out_jsonl,
    split="train",
)
# Print the first example to confirm the field names and formatting
first_example = ds[0]
print(first_example)

Generating train split: 0 examples [00:00, ? examples/s]

{'input_text': "What were Nigeria's fossil CO2 emissions (tons) in 1977?", 'target_text': "In 1977, Nigeria's fossil CO2 emissions were 66,469,720 tons."}


In [None]:
# ---------- FINETUNING----------
# Cell 8: Settings for the Hugging Face Trainer + PEFT (LoRA) fine-tuning template

# Minimal example; adapt the values below to your environment.
jsonl_train = out_jsonl                        # training data written by the JSONL cell
output_dir = "./lora_nigeria_co2"              # where the Trainer writes its outputs
model_name = "meta-llama/Llama-2-7b-chat-hf"  # replace with model you have access to

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

In [None]:
# Load tokenizer (the model is gated, so pass the Hub token configured earlier in the notebook)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, token=HF_TOKEN)
# Ensure tokenizer.pad_token exists WITHOUT growing the vocabulary.
# Adding a brand-new '[PAD]' token would give it an id beyond the model's embedding
# matrix (the embeddings are never resized in this notebook), so padded batches
# would index out of range. Reusing an existing token avoids that. The unknown
# token is preferred over EOS so that real end-of-sequence tokens are not treated
# as padding (and label-masked) by the data collator.
if tokenizer.pad_token is None:
    tokenizer.pad_token = (
        tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
    )

In [None]:
# Load model (8-bit + prepare for k-bit training helps memory)
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Quantization is requested through a config object; the bare `load_in_8bit=`
    # keyword is deprecated in recent transformers releases. Requires bitsandbytes.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    token=HF_TOKEN,  # gated model: authenticate with the token configured earlier
    # `trust_remote_code=True` was dropped: it executes arbitrary code from the
    # model repository and is not needed for architectures built into transformers.
    # Re-enable it only for a community model you have reviewed and trust.
)
# Readies the quantized model for k-bit (LoRA) training via the PEFT helper.
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings, gathered in one place so they are easy to tweak
lora_settings = {
    "task_type": "CAUSAL_LM",
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    # adjust depending on model architecture
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters described above
model = get_peft_model(model, lora_config)

In [None]:
# Prepare dataset tokenizer function
def preprocess_fn(examples):
    """Tokenize a batch of instruction/answer pairs for causal-LM fine-tuning.

    Each pair is rendered as ``"Instruction: ...\\nAnswer: ..."`` followed by the
    tokenizer's EOS token, then tokenized with padding to the longest sequence
    in the batch and truncation at 512 tokens.

    Args:
        examples: Batched mapping with parallel ``"input_text"`` and
            ``"target_text"`` lists (as passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padding positions are replaced by ``-100``.
    """
    # Terminate every sample with EOS so the model can learn where an answer ends.
    # NOTE(review): assumes the tokenizer does not append EOS on its own -- confirm
    # for the tokenizer in use, otherwise the token would appear twice.
    eos = tokenizer.eos_token or ""
    inputs = [
        f"Instruction: {instr}\nAnswer: {tgt}{eos}"
        for instr, tgt in zip(examples["input_text"], examples["target_text"])
    ]
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="longest",
        max_length=512,
        return_attention_mask=True,
    )
    # Labels mirror the input ids (causal LM), except on padding: positions where
    # the attention mask is 0 get -100, the conventional "ignore" label, so padding
    # does not contribute to the loss. The mask is used rather than the pad id so
    # a pad token that aliases a real token is still handled correctly.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

In [None]:
# Load the JSONL training file written earlier in the notebook
ds = load_dataset("json", data_files=jsonl_train, split="train")
# Validate the schema up front: preprocess_fn reads both columns, so a missing
# one should fail here with a clear message rather than inside ds.map().
missing_columns = [name for name in ("input_text", "target_text") if name not in ds.column_names]
if missing_columns:
    raise ValueError(f"Expected column(s) {missing_columns} in the dataset JSONL")

# Tokenize in batches and drop the raw text columns so only model inputs remain
tokenized_ds = ds.map(preprocess_fn, batched=True, remove_columns=ds.column_names)

# mlm=False selects causal (next-token) language modeling rather than masked LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training args
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,    # tune based on your GPU
    gradient_accumulation_steps=8,    # effective batch size = 4 * 8 = 32 per device
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    # The dataset has only 46 examples, so with an effective batch of 32 the whole
    # run is at most ~6 optimizer steps. The previous logging_steps=10 was never
    # reached and no training loss was logged; log every step instead.
    logging_steps=1,
    save_total_limit=2,
    remove_unused_columns=False,      # dataset already contains only model inputs
    push_to_hub=False,
)

In [None]:
# Assemble the Trainer from the pieces prepared in the cells above
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)