In [None]:
pip install wandb




In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import os

# Step 1: Load and inspect your Excel dataset
file_path = '/Book.xlsx'  # Replace with the uploaded file path
df = pd.read_excel(file_path)

# Inspect the first few rows to ensure it's loaded correctly
print(df.head())

# Step 2: Check for missing values and data types
print(df.info())  # Check data types and missing values

# If necessary, handle missing values by dropping or filling them
df = df.dropna(subset=['Home', 'Away', 'Outcome', 'Overs'])  # Remove rows with any missing values in these columns

# Ensure that 'Overs' is numeric (converting it to float if necessary)
df['Overs'] = pd.to_numeric(df['Overs'], errors='coerce')  # Converts to NaN if conversion fails
df = df.dropna(subset=['Overs'])  # Remove rows where 'Overs' is still NaN after conversion

# Step 3: Preprocess the data to create a textual representation for each row
# Combine Home, Away, Outcome, and Overs into a text description
df['text'] = df['Home'] + " vs " + df['Away'] + " - Winner: " + df['Outcome'] + " - Overs: " + df['Overs'].astype(str)

# Ensure there are no missing values in the 'text' column
df = df.dropna(subset=['text'])

# Step 4: Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[['text']])  # Only include the 'text' column

# Step 5: Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add the padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token # Using eos_token as pad_token

# Step 6: Tokenize the dataset
def tokenize_function(examples):
    # Tokenizing the 'text' column
    tokenized = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
    # Create labels - shifted input_ids
    tokenized['labels'] = tokenized['input_ids'].copy()  # Labels are the same as input_ids for language modeling
    return tokenized

# Apply the tokenize function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 7: Split the dataset into training and validation sets
train_dataset = tokenized_datasets.train_test_split(test_size=0.2)['train']
val_dataset = tokenized_datasets.train_test_split(test_size=0.2)['test']

# Step 8: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for results
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs to train
    logging_dir="./logs",            # Directory for logs
    report_to=None
)

# Step 9: Initialize the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Before initializing the Trainer, disable wandb environment variable
os.environ["WANDB_DISABLED"] = "true"

# Step 10: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Step 11: Fine-tune the model
trainer.train()

# Step 12: Evaluate the model
trainer.evaluate()

# Step 13: Use the model for text generation
input_text = "TeamA vs TeamB - Winner: Home - Overs: 20"
inputs = tokenizer(input_text, return_tensors="pt", padding=True) # Add padding=True here

# Generate text (you can adjust max_length to control length of generated text)
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=100) # Pass attention_mask

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Step 14: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")


  Match Time                   HomeTeam                  AwayTeam  Home Score  \
0      04:00  Athletic Bilbao (Aladdin)        Real Madrid (Pele)           3   
1      05:30  Athletic Bilbao (Aladdin)  Atletico Madrid (Xerxes)           1   
2      05:45  Athletic Bilbao (Aladdin)           Girona FC (Val)           1   
3      06:00  Athletic Bilbao (Aladdin)      FC Barcelona (Panic)           1   
4      06:30     FC Barcelona (Aladdin)         Sl Benfica (Pele)           3   

   Away Score     Home    Away  Outcome  Overs  
0           2  Aladdin    Pele  Aladdin    4.5  
1           4  Aladdin  Xerxes   Xerxes    4.5  
2           1  Aladdin     Val  Aladdin    0.0  
3           2  Aladdin   Panic    Panic    2.5  
4           4  Aladdin    Pele     Pele    4.5  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041 entries, 0 to 1040
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Match Time  1041 non-null 

Map:   0%|          | 0/1041 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,0.059298
2,No log,0.04435
3,0.119800,0.040762


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TeamA vs TeamB - Winner: Home - Overs: 20.5


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [23]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

# Read the excel file
excel_file_path = '/content/Test.xlsx' # Replace with your excel file path
df = pd.read_excel(excel_file_path)


# Iterate through rows and generate predictions
for index, row in df.iterrows():
        home_team = row['Home']
        away_team = row['Away']
        input_text = f"{home_team} vs {away_team}"
        inputs = tokenizer(input_text, return_tensors="pt", padding=True)
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=100)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Input: {input_text}, Prediction: {generated_text}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Panic vs Pele, Prediction: Panic vs Pele - Winner: Panic - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Thor vs Ivan, Prediction: Thor vs Ivan - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Aladdin vs Xerxes, Prediction: Aladdin vs Xerxes - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Zangief vs Madisson, Prediction: Zangief vs Madisson - Winner: Madisson - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Panic vs Aladdin, Prediction: Panic vs Aladdin - Winner: Panic - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Thor vs Persie, Prediction: Thor vs Persie - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Val vs Xerxes, Prediction: Val vs Xerxes - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Madisson vs Ivan, Prediction: Madisson vs Ivan - Winner: Madisson - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Pele vs Val, Prediction: Pele vs Val - Winner: Pele - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Madisson vs Thor, Prediction: Madisson vs Thor - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Zangief vs Persie, Prediction: Zangief vs Persie - Winner: Zangief - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Panic vs Xerxes, Prediction: Panic vs Xerxes - Winner: Panic - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Aladdin vs Val, Prediction: Aladdin vs Val - Winner: Aladdin - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Ivan vs Zangief, Prediction: Ivan vs Zangief - Winner: Ivan - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Madisson vs Persie, Prediction: Madisson vs Persie - Winner: Madisson - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Pele vs Xerxes, Prediction: Pele vs Xerxes - Winner: Pele - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Pele vs Aladdin, Prediction: Pele vs Aladdin - Winner: Pele - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Panic vs Val, Prediction: Panic vs Val - Winner: Panic - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Thor vs Zangief, Prediction: Thor vs Zangief - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Ivan vs Persie, Prediction: Ivan vs Persie - Winner: Persie - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Pele vs Panic, Prediction: Pele vs Panic - Winner: Pele - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Xerxes vs Aladdin, Prediction: Xerxes vs Aladdin - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Xerxes vs Val, Prediction: Xerxes vs Val - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Aladdin vs Panic, Prediction: Aladdin vs Panic - Winner: Aladdin - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Xerxes vs Panic, Prediction: Xerxes vs Panic - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Val vs Pele, Prediction: Val vs Pele - Winner: Val - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Val vs Aladdin, Prediction: Val vs Aladdin - Winner: Val - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Xerxes vs Pele, Prediction: Xerxes vs Pele - Winner: Xerxes - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Matrix vs Arthur, Prediction: Matrix vs Arthur - Winner: Professor - Overs: 0.0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Adriano vs Samuel, Prediction: Adriano vs Samuel - Winner: Aladdin - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Misterx vs Nathan, Prediction: Misterx vs Nathan - Winner: Professor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Sebastian vs Splinter, Prediction: Sebastian vs Splinter - Winner: Seagal - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Adriano vs Matrix, Prediction: Adriano vs Matrix - Winner: Adriano - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Logan vs Arthur, Prediction: Logan vs Arthur - Winner: Goliath - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Nathan vs Kai, Prediction: Nathan vs Kai - Winner: David - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Misterx vs Splinter, Prediction: Misterx vs Splinter - Winner: Professor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Samuel vs Logan, Prediction: Samuel vs Logan - Winner: Jack - Overs: 0.0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Adriano vs Arthur, Prediction: Adriano vs Arthur - Winner: Adriano - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Misterx vs Kai, Prediction: Misterx vs Kai - Winner: Professor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Nathan vs Sebastian, Prediction: Nathan vs Sebastian - Winner: Nathan - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Samuel vs Arthur, Prediction: Samuel vs Arthur - Winner: Jack - Overs: 0.0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Matrix vs Logan, Prediction: Matrix vs Logan - Winner: Professor - Overs: 0.0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Nathan vs Splinter, Prediction: Nathan vs Splinter - Winner: Drake - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Sebastian vs Kai, Prediction: Sebastian vs Kai - Winner: Seagal - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Sebastian vs Misterx, Prediction: Sebastian vs Misterx - Winner: Sebastian - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Splinter vs Kai, Prediction: Splinter vs Kai - Winner: Zangief - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Liam vs Mateo, Prediction: Liam vs Mateo - Winner: David - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Juan vs Oliver, Prediction: Juan vs Oliver - Winner: David - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Nathan vs Misterx, Prediction: Nathan vs Misterx - Winner: Nathan - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Splinter vs Sebastian, Prediction: Splinter vs Sebastian - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Juan vs Liam, Prediction: Juan vs Liam - Winner: David - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Dominic vs Oliver, Prediction: Dominic vs Oliver - Winner: Carlos - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Kai vs Nathan, Prediction: Kai vs Nathan - Winner: Kai - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Splinter vs Misterx, Prediction: Splinter vs Misterx - Winner: Potter - Overs: 0.0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Dominic vs Mateo, Prediction: Dominic vs Mateo - Winner: Carlos - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Oliver vs Liam, Prediction: Oliver vs Liam - Winner: Oliver - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Kai vs Misterx, Prediction: Kai vs Misterx - Winner: Madisson - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Sebastian vs Nathan, Prediction: Sebastian vs Nathan - Winner: Sato - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Dominic vs Liam, Prediction: Dominic vs Liam - Winner: David - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Mateo vs Juan, Prediction: Mateo vs Juan - Winner: Molotov - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Dominic vs Juan, Prediction: Dominic vs Juan - Winner: Carlos - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Mateo vs Oliver, Prediction: Mateo vs Oliver - Winner: Molotov - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Mateo vs Liam, Prediction: Mateo vs Liam - Winner: Molotov - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Oliver vs Juan, Prediction: Oliver vs Juan - Winner: Thor - Overs: 4.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Liam vs Juan, Prediction: Liam vs Juan - Winner: David - Overs: 4.5
Input: Oliver vs Dominic, Prediction: Oliver vs Dominic - Winner: David - Overs: 4.5
