In [6]:
import pandas as pd
from simplet5 import SimpleT5
from sklearn.model_selection import train_test_split
import torch

# Step 1: Read the CSV file
data = pd.read_csv('Updated_dataset.csv')
data['source_text'].fillna('No text', inplace=True)
data['target_text'].fillna('No text', inplace=True)

# Ensure all data is string type
data['source_text'] = data['source_text'].astype(str)
data['target_text'] = data['target_text'].astype(str)

# Ensure the DataFrame has the required columns
assert 'source_text' in data.columns and 'target_text' in data.columns, "CSV must contain 'source_text' and 'target_text' columns."

# Step 2: Split the data into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# No need to rename columns as they already have the correct names

# Step 3: Initialize SimpleT5
model = SimpleT5()
model.from_pretrained("t5", "t5-small")

# Step 4: Train the model (with GPU)
model.train(train_df=train_df,
            eval_df=test_df,
            source_max_token_len=128,
            target_max_token_len=50,
            batch_size=16,
            max_epochs=5,
            use_gpu=True)  # Set use_gpu=False for CPU usage

# Step 5: Set the device manually
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.model.to(device)

# Step 6: Make predictions (assuming you have input text)
def generate_text(input_text):
    input_ids = model.tokenizer.encode(input_text, return_tensors='pt', max_length=128, truncation=True).to(device)
    outputs = model.model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    return model.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prediction
output = generate_text("For luxury woodworking projects, which tool offers precision and efficiency in planing wooden materials?")
print(output)


INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  self.pid = os.fork()
INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  self.pid = os.fork()


Planers


In [4]:
pip install simplet5



In [9]:
import pandas as pd
from simplet5 import SimpleT5
from sklearn.model_selection import train_test_split
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import numpy as np

# Download the NLTK data for BLEU
nltk.download('punkt')

# Step 1: Read the CSV file
data = pd.read_csv('Updated_dataset.csv')
data['source_text'].fillna('No text', inplace=True)
data['target_text'].fillna('No text', inplace=True)
# Ensure all data is string type
data['source_text'] = data['source_text'].astype(str)
data['target_text'] = data['target_text'].astype(str)


# Ensure the DataFrame has the required columns
assert 'source_text' in data.columns and 'target_text' in data.columns, "CSV must contain 'source_text' and 'target_text' columns."

# Step 2: Split the data into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Initialize SimpleT5
model = SimpleT5()
model.from_pretrained("t5", "t5-small")

# Step 4: Train the model (without GPU)
model.train(train_df=train_df,
            eval_df=test_df,
            source_max_token_len=256,  # Increased max token length
            target_max_token_len=128,  # Increased max token length
            batch_size=8,              # Reduced batch size if memory is an issue
            max_epochs=10,             # Increased number of epochs
            use_gpu=True)             # Set use_gpu=True if you have a GPU

# Step 5: Set the device manually
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.model.to(device)

# Step 6: Make predictions and calculate BLEU score
def generate_text(input_text):
    input_ids = model.tokenizer.encode(input_text, return_tensors='pt', max_length=256, truncation=True).to(device)
    outputs = model.model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    return model.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to calculate BLEU score for the test set
def calculate_bleu_score(test_df):
    references = []
    candidates = []
    bleu_scores = []
    for _, row in test_df.iterrows():
        source_text = row['source_text']
        target_text = row['target_text']
        predicted_text = generate_text(source_text)

        # Tokenize the target and predicted text
        reference = [nltk.word_tokenize(target_text)]
        candidate = nltk.word_tokenize(predicted_text)

        # Calculate BLEU score
        bleu_score = sentence_bleu(reference, candidate)
        bleu_scores.append(bleu_score)

        references.append(reference)
        candidates.append(candidate)

        print(f"Source: {source_text}")
        print(f"Target: {target_text}")
        print(f"Predicted: {predicted_text}")
        print(f"BLEU score: {bleu_score}\n")

    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    corpus_bleu_score = corpus_bleu(references, candidates)
    return average_bleu_score, corpus_bleu_score, bleu_scores

# Calculate the BLEU score for the test set
average_bleu_score, corpus_bleu_score, bleu_scores = calculate_bleu_score(test_df)
print(f"Average sentence-level BLEU score: {average_bleu_score}")
print(f"Corpus-level BLEU score: {corpus_bleu_score}")
print(f"BLEU score standard deviation: {np.std(bleu_scores)}")
print(f"BLEU score median: {np.median(bleu_scores)}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  self.pid = os.fork()
INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Source: No text
Target: Delta Sanders
Predicted: Hand Sanding
BLEU score: 0

Source: For professionals working on construction sites, what tool offers optimal performance?
Target: IMPACT DRILLER
Predicted: Reciprocating Saw
BLEU score: 0

Source: No text
Target: Belt Sanders
Predicted: Large Angle Grinders
BLEU score: 0

Source: 8. What is the maximum depth of cut?
Target: Palm Routers
Predicted: Routers
BLEU score: 6.702145341854094e-232

Source: No text
Target: Table Circular Saw
Predicted: Hand Sanding
BLEU score: 0



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Source: What are the key features to consider when choosing a hand-held dry cutter?
Target: Hand Held Dry Cutters
Predicted: Hand Held Dry Cutters
BLEU score: 1.0

Source: Who is the target audience for this tool, professionals, hobbyists, or both?
Target: Hand Held Dry Cutters
Predicted: Hand Held Dry Cutters
BLEU score: 1.0

Source: What power tool is ideal for sanding metal materials without causing damage?
Target: Orbital Sanders
Predicted: Multi Sanders
BLEU score: 1.5319719891192393e-231

Source: No text
Target: Straight Grinder
Predicted: Drum Sander
BLEU score: 0

Source: When shaping or smoothing curved edges, which tool offers precision and control?
Target: Drum Sander
Predicted: Straight Grinder
BLEU score: 0

Source: What is involved in the cutting process of a vertical panel saw?
Target: Vertical Panel Saw
Predicted: Vertical Panel Saw
BLEU score: 1.2213386697554703e-77

Source: No text
Target: Table Circular Saw
Predicted: Multi Sanders
BLEU score: 0

Source: No text
Targ