In [None]:
import os
from google.colab import userdata

os.environ["KAGGLE_USERNAME"] = userdata.get("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = userdata.get("KAGGLE_KEY")

In [None]:
!pip install -q -U keras-nlp
!pip install -q -U "Keras>3"
!pip install -q -U bert-score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/548.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m548.4/548.4 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
os.environ["KERAS_BACKEND"] = "jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

#Import Packeages

In [None]:
# Import necessary libraries
import os
import keras
import keras_nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from bert_score import score
import plotly.graph_objs as go
import plotly.express as px
from IPython.display import display, Markdown

In [None]:
tqdm.pandas()  # Progress bar for pandas

#Configuration

In [None]:
class CFG:
  seed = 42
  dataset_path = "/content/data.csv"
  preset = "gemma_2b_en"
  sequence_length = 512
  batch_size = 1
  epochs = 3

#Reproducibility

In [None]:
keras.utils.set_random_seed(CFG.seed)

#DATA

In [None]:
df = pd.read_csv(f"{CFG.dataset_path}")
df.head(2)

Unnamed: 0,Question,Answer,Category
0,What are the different types of competitions a...,# Types of Competitions\n\nKaggle Competitions...,competition
1,What are the different competition formats on ...,There are handful of different formats competi...,competition


#Prompt Template

In [None]:
template = "\n\nCategory:\nkaggle-{Category}\n\nQuestion:\n{Question}\n\nAnswer:\n{Answer}"
#df["prompt"] = df.progress_apply(lambda x: template.format(**x), axis=1)
#df.head()

In [None]:
df["prompt"] = df.progress_apply(lambda row: template.format(Category=row.Category,
                                                             Question=row.Question,
                                                             Answer=row.Answer), axis=1)
data = df.prompt.tolist()

  0%|          | 0/60 [00:00<?, ?it/s]

#Sample

In [None]:
def colorize_text(text):
  for word, color in zip(["Category" , "Question","Answer"], ["blue", "orange", "green"]):
    text = text.replace(word, f"<span style='color: {color}'>{word}</span>")
  return text

#Gemma Causal LM

In [None]:
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
gemma_lm.summary()

#Gemma LM Preprocessor

In [None]:
x, y, sample_weight = gemma_lm.preprocessor(data[0:2])

In [None]:
# Display the shape of each processed output
for k, v in x.items():
    print(k, ":", v.shape)

token_ids : (2, 1024)
padding_mask : (2, 1024)


#Inference before fine tuning

Sample

In [None]:
# Take one sample
row = df.iloc[2]

# Generate Prompt using template
prompt = template.format(
    Category=row.Category,
    Question=row.Question,
    Answer=""
)

# Infer
output = gemma_lm.generate(prompt, max_length=256)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




<span style='color: blue'>Category</span>:
kaggle-competition

<span style='color: orange'>Question</span>:
How to join a competition?

<span style='color: green'>Answer</span>:
1. Go to the competition page.
2. Click on the "Join" button.
3. Enter your email address and click on the "Join" button.
4. You will receive an email with a link to confirm your email address.
5. Click on the link in the email to confirm your email address.
6. You will now be able to log in to the competition.

<span style='color: blue'>Category</span>:
kaggle-competition

<span style='color: orange'>Question</span>:
How to submit a solution?

<span style='color: green'>Answer</span>:
1. Go to the competition page.
2. Click on the "Submit" button.
3. Enter your solution in the text box and click on the "Submit" button.
4. You will receive a confirmation email with the status of your submission.

<span style='color: blue'>Category</span>:
kaggle-competition

<span style='color: orange'>Question</span>:
How to view the leaderboard?

<span style='color: green'>Answer</span>:
1. Go to the competition page.
2. Click on the "Leaderboard" button.
3. You will see the leaderboard with the top 100 participants.

<span style='color: blue'>Category</span>:
kaggle-competition

<span style='color: orange'>Question</span>:
How to view the

#Fine-tuning with LoRA

In [None]:
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

#**Training**

In [None]:
# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = CFG.sequence_length

# Compile the model with loss, optimizer, and metric
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=8e-5),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)



In [None]:
# Split the dataset into train (80%) and test (20%)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=CFG.seed)

train_prompts = train_data.prompt.tolist()
test_prompts = test_data.prompt.tolist()
test_answers = test_data['Answer'].tolist()  # Extract actual answers for comparison

# Metric Before Fine Tune

In [None]:
# 1. Before fine-tuning evaluation
predictions_before = []
for prompt in test_prompts:
    output = gemma_lm.generate(prompt, max_length=256)
    predictions_before.append(output[0])

# Filter and validate predictions and answers
valid_predictions_before = [pred for pred in predictions_before if pred.strip()]
valid_answers = [ans for ans in test_answers if ans.strip()]

# Ensure both lists are not empty and of equal length
if len(valid_predictions_before) == len(valid_answers):
    print("Calculating BERTScore before fine-tuning...")
    P_before, R_before, F1_before = score(valid_predictions_before, valid_answers, lang="en", model_type="bert-base-uncased")
    print(f"Before Fine-Tuning -> Precision: {P_before.mean().item()}, Recall: {R_before.mean().item()}, F1 Score: {F1_before.mean().item()}")
else:
    print("Mismatch in the number of valid predictions and answers before fine-tuning.")

Mismatch in the number of valid predictions and answers before fine-tuning.


In [None]:
# Generate predictions before fine-tuning
predictions_before = []
for prompt in test_prompts:
    output = gemma_lm.generate(prompt, max_length=256)
    predictions_before.append(output[0])

# Debug: Check the lengths of predictions and answers
print(f"Total Prompts: {len(test_prompts)}")
print(f"Total Predictions Before: {len(predictions_before)}")
print(f"Total Answers: {len(test_answers)}")

# Filter out empty predictions and answers
valid_predictions_before = [pred for pred in predictions_before if pred.strip()]
valid_answers = [ans for ans in test_answers if ans.strip()]

# Debug: Check the lengths after filtering
print(f"Valid Predictions Before: {len(valid_predictions_before)}")
print(f"Valid Answers: {len(valid_answers)}")

# Ensure valid predictions and answers are aligned
if len(valid_predictions_before) != len(valid_answers):
    print(f"Mismatch: {len(valid_predictions_before)} predictions, {len(valid_answers)} answers.")
else:
    print("Calculating BERTScore before fine-tuning...")
    P_before, R_before, F1_before = score(valid_predictions_before, valid_answers, lang="en", model_type="bert-base-uncased")
    print(f"Before Fine-Tuning -> Precision: {P_before.mean().item()}, Recall: {R_before.mean().item()}, F1 Score: {F1_before.mean().item()}")


Total Prompts: 12
Total Predictions Before: 12
Total Answers: 12
Valid Predictions Before: 0
Valid Answers: 12
Mismatch: 0 predictions, 12 answers.


In [None]:
predictions_before = []
for prompt in test_prompts:
    output = gemma_lm.generate(prompt, max_length=256)
    predictions_before.append(output[0])

valid_predictions_before = [pred for pred in predictions_before if pred.strip()]
valid_answers = [ans for ans in test_answers if ans.strip()]

if len(valid_predictions_before) == len(valid_answers):
    print("Calculating BERTScore before fine-tuning...")
    P_before, R_before, F1_before = score(valid_predictions_before, valid_answers, lang="en", model_type="bert-base-uncased")
    print(f"Before Fine-Tuning -> Precision: {P_before.mean().item()}, Recall: {R_before.mean().item()}, F1 Score: {F1_before.mean().item()}")
else:
    print(f"Mismatch: {len(valid_predictions_before)} predictions, {len(valid_answers)} answers.")


Mismatch: 0 predictions, 12 answers.


In [None]:
from bert_score import score

# Generate predictions before fine-tuning
predictions_before = []
for prompt in test_prompts:
    output = gemma_lm.generate(prompt, max_length=256)
    predictions_before.append(output[0])

# Assuming test_answers contains the ground truth answers
P_before, R_before, F1_before = score(predictions_before, test_answers, lang="en", model_type="bert-base-uncased")

print(f"Before Fine-Tuning -> Precision: {P_before.mean().item()}, Recall: {R_before.mean().item()}, F1 Score: {F1_before.mean().item()}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Before Fine-Tuning -> Precision: 0.0, Recall: 0.0, F1 Score: 0.0




In [33]:
# Generate predictions before fine-tuning
predictions_before = []
for prompt in test_prompts:
    output = gemma_lm.generate(prompt, max_length=256)
    predictions_before.append(output[0])

# Filter out empty predictions and answers
valid_predictions_before = [pred for pred in predictions_before if pred.strip()]
valid_answers = [ans for ans in test_answers if ans.strip()]

# Ensure valid predictions and answers are aligned
if len(valid_predictions_before) != len(valid_answers):
    print(f"Mismatch: {len(valid_predictions_before)} predictions, {len(valid_answers)} answers.")
else:
    print("Calculating BERTScore before fine-tuning...")
    P_before, R_before, F1_before = score(valid_predictions_before, valid_answers, lang="en", model_type="bert-base-uncased")
    print(f"Before Fine-Tuning -> Precision: {P_before.mean().item()}, Recall: {R_before.mean().item()}, F1 Score: {F1_before.mean().item()}")

Mismatch: 0 predictions, 12 answers.


In [34]:
# Generate predictions on the test set
predictions = []
for prompt in test_data:
    generated = gemma_lm.generate(prompt, max_length=CFG.sequence_length)
    generated_text = generated[0]  # Assuming the first prediction is the most relevant

    # Check if the prediction is empty and print a message if it is
    if not generated_text.strip():
        print(f"Empty prediction for prompt: {prompt}")

    predictions.append(generated_text)

# Check for empty answers
for answer in test_answers:
    if not answer.strip():
        print("Empty answer found.")

# Filter out empty predictions and corresponding test answers
valid_predictions = []
valid_answers = []

for pred, ans in zip(predictions, test_answers):
    if pred.strip() and ans.strip():
        valid_predictions.append(pred)
        valid_answers.append(ans)

# Ensure there are valid predictions and answers before calculating BERTScore
if not valid_predictions or not valid_answers:
    print("No valid predictions or answers found.")
else:
    # Calculate BERTScore for valid predictions and answers
    P, R, F1 = score(valid_predictions, valid_answers, lang="en", model_type="bert-base-multilingual-cased")

    # Print BERTScore results
    print(f"Precision: {P.mean().item()}, Recall: {R.mean().item()}, F1 Score: {F1.mean().item()}")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Precision: 0.5710833072662354, Recall: 0.3769643306732178, F1 Score: 0.4540744721889496


In [35]:
# Train the model
gemma_lm.fit(train_prompts, epochs=CFG.epochs, batch_size=CFG.batch_size)

Epoch 1/3
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 2s/step - loss: 1.7566 - sparse_categorical_accuracy: 0.5256
Epoch 2/3
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - loss: 1.7333 - sparse_categorical_accuracy: 0.5296
Epoch 3/3
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - loss: 1.6828 - sparse_categorical_accuracy: 0.5410


<keras.src.callbacks.history.History at 0x7a6dee4055d0>

# Save The model

In [None]:
gemma_lm.save("gemma2_2b_kaggle_docs.keras")

KeyboardInterrupt: 

# Metrics

In [36]:
# Generate predictions on the test set
predictions = []
for prompt in test_data:
    generated = gemma_lm.generate(prompt, max_length=CFG.sequence_length)
    generated_text = generated[0]  # Assuming the first prediction is the most relevant

    # Check if the prediction is empty and print a message if it is
    if not generated_text.strip():
        print(f"Empty prediction for prompt: {prompt}")

    predictions.append(generated_text)

# Check for empty answers
for answer in test_answers:
    if not answer.strip():
        print("Empty answer found.")

# Filter out empty predictions and corresponding test answers
valid_predictions = []
valid_answers = []

for pred, ans in zip(predictions, test_answers):
    if pred.strip() and ans.strip():
        valid_predictions.append(pred)
        valid_answers.append(ans)

# Ensure there are valid predictions and answers before calculating BERTScore
if not valid_predictions or not valid_answers:
    print("No valid predictions or answers found.")
else:
    # Calculate BERTScore for valid predictions and answers
    P, R, F1 = score(valid_predictions, valid_answers, lang="en", model_type="bert-base-multilingual-cased")

    # Print BERTScore results
    print(f"Precision: {P.mean().item()}, Recall: {R.mean().item()}, F1 Score: {F1.mean().item()}")




Precision: 0.5710833072662354, Recall: 0.3769643306732178, F1 Score: 0.4540744721889496


# Inference After Fine Tuning

In [None]:
# Check the contents of test_data to make sure it's not empty
print(f"Test Data Length: {len(test_data)}")
print(f"First few items: {test_data[:5]}")


Test Data Length: 12
First few items:                                              Question  \
0   What are the different types of competitions a...   
5                             What is Data Leakage?\n   
36                 How do organization profiles work?   
45                   How do Kaggle competitions work?   
13             What are the main features of the TPU?   

                                               Answer           Category  \
0   # Types of Competitions\n\nKaggle Competitions...        competition   
5   Data Leakage is the presence of unexpected add...        competition   
36  ## What are organizations for?\n\nOrganization...       organization   
45  ## Overview\n\nEvery competition has two thing...  competition-setup   
13  At approximately 20 inches (50 cm), a TPU v3-8...                tpu   

                                               prompt  
0   \n\nCategory:\nkaggle-competition\n\nQuestion:...  
5   \n\nCategory:\nkaggle-competition\n\nQuestion:

In [None]:
# Generate the output using the model
output = gemma_lm.generate(prompt, max_length=256)

# Check if output is empty or not
print(f"Generated output: {output}")

# Colorize the output only if it's not empty
if output:
    output = colorize_text(output[0])
    display(Markdown(output))
else:
    print("No output generated by the model.")


Generated output: 

Category:
kaggle-competition

Question:
What are the different types of competitions available on Kaggle?

Answer:
Kaggle has a variety of competitions available for users to participate in. The most common type of competition is a data science competition, where users are tasked with analyzing a dataset and developing a model to predict a target variable. Other types of competitions include image classification, natural language processing, and machine learning competitions.

In addition to data science competitions, Kaggle also hosts competitions in other areas such as business intelligence, data visualization, and machine learning. These competitions typically involve users creating dashboards or visualizations to analyze data and make predictions.

Overall, Kaggle offers a wide range of competitions for users to participate in, covering a variety of topics and skill levels. Whether you're a beginner or an experienced data scientist, there's likely a competition 


