In [None]:
!pip install transformers datasets torch pandas scikit-learn


In [None]:
import kagglehub

# Fetch the Kaggle medicine dataset through kagglehub. The returned `path` is
# treated as a local directory by the later cells (os.listdir / os.path.join).
path = kagglehub.dataset_download("singhnavjot2062001/11000-medicine-details")

print("Path to dataset files:", path)

In [4]:
import pandas as pd
import os
from datasets import Dataset


In [5]:
# Same dataset_download call as in the earlier cell: it rebinds `path` and
# still depends on `kagglehub` having been imported by that earlier cell.
path = kagglehub.dataset_download("singhnavjot2062001/11000-medicine-details")

In [None]:
import os

# Show which files the download produced, so the CSV name used in the next
# cell can be checked against the real listing.
print("Files in dataset folder:")
dataset_files = os.listdir(path)
print(dataset_files)

In [None]:
# Read the medicine table from the downloaded folder into `df` and preview
# the first rows (the trailing expression is what the cell displays).
csv_path = os.path.join(path, "Medicine_Details.csv")  # must match a name shown by the os.listdir(path) cell
df = pd.read_csv(csv_path)
df.head()

In [8]:
# Keep only the three columns that create_input_output reads when it builds
# the text pairs. Note: this rebinds `df`, so the full table is no longer
# reachable without re-running the load cell.
df = df[['Composition', 'Uses', 'Side_effects']]

In [9]:
# Drop every row that has a missing value in any of the kept columns.
# This is required rather than optional: create_input_output calls .strip()
# on each field, and a missing value (NaN) has no such method.
df = df.dropna()

In [10]:
def create_input_output(row):
    """Build one (input, output) text pair from a medicine record.

    The input is the whitespace-trimmed ``Composition`` value. The output
    combines the trimmed ``Uses`` and ``Side_effects`` values into a single
    ``"Uses: ...; Side effects: ..."`` string.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that supports
            lookup by the keys ``Composition``, ``Uses`` and ``Side_effects``
            and whose values provide ``.strip()``.

    Returns:
        A two-element ``pd.Series``: ``[input_text, output_text]``.
    """
    # Trim the three fields in the order Composition -> Uses -> Side_effects.
    composition, uses, side_effects = (
        row[column].strip() for column in ('Composition', 'Uses', 'Side_effects')
    )
    target = f"Uses: {uses}; Side effects: {side_effects}"
    return pd.Series([composition, target])

# Run the row-wise builder over the whole frame, then attach its two results
# as the new 'input' and 'output' columns.
text_pairs = df.apply(create_input_output, axis=1)
df[['input', 'output']] = text_pairs

In [11]:
# Reduce the frame to the two model-facing columns, write that same table
# under both file names, and print a preview of the first rows.
df = df[['input', 'output']]
for csv_name in ("medicine_data_cleaned.csv", "medicine_data_for_model.csv"):
    df.to_csv(csv_name, index=False)
print(df.head())




In [16]:
from datasets import Dataset
# The earlier dropna() can leave gaps in the pandas index. Without
# preserve_index=False, from_pandas would carry that index into the dataset
# as an extra "__index_level_0__" column next to 'input' and 'output'.
hf_dataset = Dataset.from_pandas(df, preserve_index=False)


In [18]:
from transformers import T5Tokenizer

In [None]:
# Load the tokenizer for the "t5-small" checkpoint. preprocess() below reads
# it as the module-level name `tokenizer`, so this cell must run first.
tokenizer = T5Tokenizer.from_pretrained("t5-small")


In [20]:
# Fixed sequence lengths handed to the tokenizer as `max_length` in
# preprocess(): every encoded input is truncated/padded to max_input_len and
# every encoded target to max_output_len.
max_input_len = 64
max_output_len = 128

In [21]:
def preprocess(example):
    """Tokenize an example into encoder inputs and loss-ready labels.

    Both the ``'input'`` and ``'output'`` texts are truncated and padded to a
    fixed length (``max_input_len`` / ``max_output_len``) with the
    module-level ``tokenizer``.

    Padding positions in the labels are replaced with ``-100``. The original
    version returned the padded target ids unchanged, so the training loss
    was also computed on pad tokens; ``-100`` is the label value that the
    loss ignores.

    Args:
        example: Mapping with ``'input'`` and ``'output'`` entries. Each entry
            is either a single string (``map(..., batched=False)``) or a list
            of strings (``map(..., batched=True)``).

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
        flat lists for a single example, lists of lists for a batch.
    """
    input_enc = tokenizer(
        example['input'],
        truncation=True,
        padding='max_length',
        max_length=max_input_len
    )
    output_enc = tokenizer(
        example['output'],
        truncation=True,
        padding='max_length',
        max_length=max_output_len
    )

    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        # A batched call yields one id list per example; recurse into each.
        if ids and isinstance(ids[0], (list, tuple)):
            return [mask_padding(seq) for seq in ids]
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    return {
        'input_ids': input_enc['input_ids'],
        'attention_mask': input_enc['attention_mask'],
        'labels': mask_padding(output_enc['input_ids'])
    }

In [None]:
# Tokenize the dataset one example at a time (batched=False), so preprocess
# receives a single 'input'/'output' pair per call.
tokenized_dataset = hf_dataset.map(preprocess, batched=False)

In [None]:
# Sanity check: display the first tokenized record.
tokenized_dataset[:1]

In [24]:
# Hold out 20% of the examples for validation. The fixed seed makes the split
# (and therefore the reported eval numbers) reproducible across
# Restart & Run All; without it every run shuffles differently.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]


In [None]:
# Display the training split's summary to confirm its contents.
train_dataset


In [26]:
from transformers import T5ForConditionalGeneration
from transformers import TrainingArguments
from transformers import Trainer


In [None]:
# Load the pretrained "t5-small" checkpoint; this is the model passed to the
# Trainer below for fine-tuning.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [34]:
# Configuration for the fine-tuning run: 5 epochs, batch size 4 per device
# for both training and evaluation, with evaluation and checkpointing both
# set to the "epoch" strategy. Checkpoints go to output_dir, logs to
# logging_dir.
training_args = TrainingArguments(
    output_dir="./medicine-t5-results",
    eval_strategy="epoch", # this spelling is the one this notebook's transformers version accepts
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False  # keep all artifacts local; nothing is uploaded to the Hugging Face Hub
)

In [35]:
# Wire the model, the training configuration and the two splits together.
# val_dataset is the "test" part of the train_test_split above and serves as
# the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Start fine-tuning with the configuration above; as the cell's trailing
# expression, the value returned by train() is what the cell displays.
trainer.train()
