In [1]:
!pip install kaggle kagglehub
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
!pip install transformers accelerate datasets bitsandbytes
!pip install sentencepiece peft


Looking in indexes: https://download.pytorch.org/whl/cu117
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2

In [2]:
from google.colab import files
files.upload()    # Upload `kaggle.json`


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"errornautical","key":"02285ff06264cd839511ca386b885479"}'}

In [3]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [4]:
import kagglehub

# Download the dataset
path = kagglehub.dataset_download("vineetkukreti/indian-agriculture-dataset")
print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/vineetkukreti/indian-agriculture-dataset?dataset_version_number=1...


100%|██████████| 2.07M/2.07M [00:00<00:00, 69.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/vineetkukreti/indian-agriculture-dataset/versions/1





In [5]:
import os
print(os.listdir(path))


['ICRISAT-District Level Data.csv']


In [6]:
import pandas as pd
from datasets import Dataset

# Load dataset
df = pd.read_csv(f"{path}/ICRISAT-District Level Data.csv")  # Replace with your actual file name
print(df.head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into training and validation sets
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)


   Dist Code  Year  State Code    State Name Dist Name  RICE AREA (1000 ha)  \
0          1  1966          14  Chhattisgarh      Durg                548.0   
1          1  1967          14  Chhattisgarh      Durg                547.0   
2          1  1968          14  Chhattisgarh      Durg                556.3   
3          1  1969          14  Chhattisgarh      Durg                563.4   
4          1  1970          14  Chhattisgarh      Durg                571.6   

   RICE PRODUCTION (1000 tons)  RICE YIELD (Kg per ha)  WHEAT AREA (1000 ha)  \
0                        185.0                  337.59                  44.0   
1                        409.0                  747.71                  50.0   
2                        468.0                  841.27                  53.7   
3                        400.8                  711.40                  49.4   
4                        473.6                  828.55                  44.2   

   WHEAT PRODUCTION (1000 tons)  ...  SUGARC

In [7]:
from huggingface_hub import login

# Paste your Hugging Face token here
login("")


In [8]:
print(dataset)
print(dataset.column_names)


DatasetDict({
    train: Dataset({
        features: ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name', 'RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'RICE YIELD (Kg per ha)', 'WHEAT AREA (1000 ha)', 'WHEAT PRODUCTION (1000 tons)', 'WHEAT YIELD (Kg per ha)', 'KHARIF SORGHUM AREA (1000 ha)', 'KHARIF SORGHUM PRODUCTION (1000 tons)', 'KHARIF SORGHUM YIELD (Kg per ha)', 'RABI SORGHUM AREA (1000 ha)', 'RABI SORGHUM PRODUCTION (1000 tons)', 'RABI SORGHUM YIELD (Kg per ha)', 'SORGHUM AREA (1000 ha)', 'SORGHUM PRODUCTION (1000 tons)', 'SORGHUM YIELD (Kg per ha)', 'PEARL MILLET AREA (1000 ha)', 'PEARL MILLET PRODUCTION (1000 tons)', 'PEARL MILLET YIELD (Kg per ha)', 'MAIZE AREA (1000 ha)', 'MAIZE PRODUCTION (1000 tons)', 'MAIZE YIELD (Kg per ha)', 'FINGER MILLET AREA (1000 ha)', 'FINGER MILLET PRODUCTION (1000 tons)', 'FINGER MILLET YIELD (Kg per ha)', 'BARLEY AREA (1000 ha)', 'BARLEY PRODUCTION (1000 tons)', 'BARLEY YIELD (Kg per ha)', 'CHICKPEA AREA (1000 ha)', 'CHICKPEA P

In [9]:
import os

for root, dirs, files in os.walk("/"):
    for file in files:
        if "ICRISAT-District Level Data.csv" in file:
            print(f"Found file: {os.path.join(root, file)}")


Found file: /root/.cache/kagglehub/datasets/vineetkukreti/indian-agriculture-dataset/versions/1/ICRISAT-District Level Data.csv


In [10]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/mistral-7b-v0.1")

# Load the dataset and preprocess it
import pandas as pd

# Path to the dataset (update with your dataset path if needed)
csv_path = "/root/.cache/kagglehub/datasets/vineetkukreti/indian-agriculture-dataset/versions/1/ICRISAT-District Level Data.csv"
data = pd.read_csv(csv_path)

# Combine relevant columns to create a text field
data["text"] = data.apply(
    lambda row: f"In {row['Year']}, {row['Dist Name']} district of {row['State Name']} had "
                f"a rice area of {row['RICE AREA (1000 ha)']} thousand hectares, producing "
                f"{row['RICE PRODUCTION (1000 tons)']} thousand tons, with a yield of "
                f"{row['RICE YIELD (Kg per ha)']} kg per hectare.",
    axis=1
)

# Convert the dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Define the preprocessing function
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Verify the tokenized dataset
print(tokenized_dataset)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/16146 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name', 'RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'RICE YIELD (Kg per ha)', 'WHEAT AREA (1000 ha)', 'WHEAT PRODUCTION (1000 tons)', 'WHEAT YIELD (Kg per ha)', 'KHARIF SORGHUM AREA (1000 ha)', 'KHARIF SORGHUM PRODUCTION (1000 tons)', 'KHARIF SORGHUM YIELD (Kg per ha)', 'RABI SORGHUM AREA (1000 ha)', 'RABI SORGHUM PRODUCTION (1000 tons)', 'RABI SORGHUM YIELD (Kg per ha)', 'SORGHUM AREA (1000 ha)', 'SORGHUM PRODUCTION (1000 tons)', 'SORGHUM YIELD (Kg per ha)', 'PEARL MILLET AREA (1000 ha)', 'PEARL MILLET PRODUCTION (1000 tons)', 'PEARL MILLET YIELD (Kg per ha)', 'MAIZE AREA (1000 ha)', 'MAIZE PRODUCTION (1000 tons)', 'MAIZE YIELD (Kg per ha)', 'FINGER MILLET AREA (1000 ha)', 'FINGER MILLET PRODUCTION (1000 tons)', 'FINGER MILLET YIELD (Kg per ha)', 'BARLEY AREA (1000 ha)', 'BARLEY PRODUCTION (1000 tons)', 'BARLEY YIELD (Kg per ha)', 'CHICKPEA AREA (1000 ha)', 'CHICKPEA PRODUCTION (1000 tons)', 'CHIC

In [12]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mistralai/mistral-7b-v0.1", device_map="auto")


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [16]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
csv_path = "/root/.cache/kagglehub/datasets/vineetkukreti/indian-agriculture-dataset/versions/1/ICRISAT-District Level Data.csv"
data = pd.read_csv(csv_path)

# Preprocess the dataset (if necessary, modify to suit your needs)
# For instance, you may create a single text column combining relevant columns
data["text"] = data["State Name"] + " " + data["Dist Name"] + " " + data["Year"].astype(str)

# Train-test split
train_texts, test_texts = train_test_split(data["text"], test_size=0.2, random_state=42)

# Convert to datasets format
from datasets import Dataset
train_dataset = Dataset.from_dict({"text": train_texts.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist()})

print("Training dataset example:", train_dataset[0])
print("Testing dataset example:", test_dataset[0])


Training dataset example: {'text': 'Madhya Pradesh Gwalior 2016'}
Testing dataset example: {'text': 'Gujarat Rajkot 2015'}


In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/mistral-7b-v0.1")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

print("Tokenized training dataset example:", tokenized_train_dataset[0])
print("Tokenized testing dataset example:", tokenized_test_dataset[0])


Map:   0%|          | 0/12916 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3230 [00:00<?, ? examples/s]

Tokenized training dataset example: {'text': 'Madhya Pradesh Gwalior 2016', 'input_ids': [1, 5311, 28716, 5157, 1921, 20830, 420, 18404, 1782, 28705, 28750, 28734, 28740, 28784], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Tokenized testing dataset example: {'text': 'Gujarat Rajkot 2015', 'input_ids': [1, 2480, 13036, 270, 16818, 28729, 322, 28705, 28750, 28734, 28740, 28782], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mistral-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    fp16=True,  # Mixed-precision training
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset
)

trainer.train()




RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.