In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'tiiuae/falcon-7b'

In [3]:
crop_df = pd.read_csv('data/APY.csv')
crop_df.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production,Yield
0,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Kharif,2439.6,3415.0,1.4
1,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Rabi,1626.4,2277.0,1.4
2,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Autumn,4147.0,3060.0,0.74
3,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Summer,4147.0,2660.0,0.64
4,Andaman and Nicobar Island,NICOBARS,Arecanut,2009,Autumn,4153.0,3120.0,0.75


In [4]:
# stripping whitespace from season col
crop_df['Season'] = crop_df['Season'].str.strip()

# creating a summary line for llm consumption
crop_df['summary'] = 'The state of ' + crop_df['State'] + ' had a crop yield of ' \
+ crop_df['Yield'].astype(str) + ' in the year ' + crop_df['Crop_Year'].astype(str) + ' during the ' + crop_df['Season'] + ' season.'

crop_df.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production,Yield,summary
0,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Kharif,2439.6,3415.0,1.4,The state of Andaman and Nicobar Island had a ...
1,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Rabi,1626.4,2277.0,1.4,The state of Andaman and Nicobar Island had a ...
2,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Autumn,4147.0,3060.0,0.74,The state of Andaman and Nicobar Island had a ...
3,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Summer,4147.0,2660.0,0.64,The state of Andaman and Nicobar Island had a ...
4,Andaman and Nicobar Island,NICOBARS,Arecanut,2009,Autumn,4153.0,3120.0,0.75,The state of Andaman and Nicobar Island had a ...


In [5]:
summary_df = crop_df[['summary']]
summary_df.head()

Unnamed: 0,summary
0,The state of Andaman and Nicobar Island had a ...
1,The state of Andaman and Nicobar Island had a ...
2,The state of Andaman and Nicobar Island had a ...
3,The state of Andaman and Nicobar Island had a ...
4,The state of Andaman and Nicobar Island had a ...


In [6]:
# converting summary_df to HuggingFace dataset
dataset = Dataset.from_pandas(summary_df)
dataset['summary'][100]

'The state of Andaman and Nicobar Island had a crop yield of 7.14 in the year 2002 during the Whole Year season.'

In [7]:
%%time
# tokenizing summary_df
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_fn(examples):
    return tokenizer(examples['summary'], padding='max_length', truncation=True)

tokenized_dataset = dataset.map(tokenize_fn) #, batched=True)

Map: 100%|█████████████████████| 345336/345336 [03:10<00:00, 1811.43 examples/s]

CPU times: user 2min 54s, sys: 9.66 s, total: 3min 4s
Wall time: 3min 11s





In [8]:
tokenized_dataset

Dataset({
    features: ['summary', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 345336
})

In [10]:
# training the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

Downloading (…)lve/main/config.json: 100%|██████| 950/950 [00:00<00:00, 845kB/s]


ValueError: Loading tiiuae/falcon-7b requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [None]:
%%time
trainer.train()