# 🧠 Mini GPT Football

This project demonstrates the use of a language model to analyze soccer events from a text dataset. The goal is to generate automated responses that interpret descriptions of plays.

---

## 📁 Requirements

Before running this notebook, make sure you have the `football_dataset_en.txt` file. You can upload it manually or download it from the GitHub repository once it's published.

> ⚠️ Link to add once the repository is published:
> [Download football_dataset_en.zip](https://github.com/Ag78910/mini-gpt-football/raw/main/football_dataset_en.zip)




---

## 🛠️ Step 1: Install dependencies

Make sure you have the following libraries installed. If you're using Google Colab, you can run them directly:

```python
!pip install -q torch==2.0.1+cu118 torchtext==0.15.2 -f https://download.pytorch.org/whl/torch_stable.html
```


---

## 📥 Step 2: Upload the text file

This block reads the dataset contents from the `football_dataset_en.txt` file.

Make sure you've uploaded it before running.


In [None]:
# 📌 Project Description
# This project trains a GPT (transformer decoder) model with soccer phrases to generate realistic text about matches and plays. It uses PyTorch and can be run in Google Colab.
# Users can upload their own dataset or use an automatically generated one.

# 👥 Target Audience
# Ideal for students, AI enthusiasts, and data analysis professionals interested in NLP and natural language generation.

In [1]:
# Mini GPT Football -

# --------------------
# ⭐ Step 1: Installation and Configuration (skip if already installed))
!pip install torchtext --quiet
!pip install torch==2.0.1+cu118 torchtext==0.15.2 -f https://download.pytorch.org/whl/torch_stable.html


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# 📂 Step 2: Upload phrase file
# Colab-only helper: this import is unavailable outside Google Colab.
from google.colab import files
# NOTE(review): when a file with the same name already exists, the recorded output
# shows the upload being saved under a new name ("football_dataset_en (1).txt"),
# while the loading step reads the fixed name "football_dataset_en.txt" — so a
# re-upload would not replace the copy that is actually read. Confirm before re-running.
uploaded = files.upload()

Saving football_dataset_en.txt to football_dataset_en (1).txt


In [6]:
# 🤖 Step 3: Preparing the dataset
import torch
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import random

# Reproducibility: seed the RNGs so that weight initialization, batch shuffling,
# dropout and sampling give the same results after Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters.
BATCH_SIZE = 2     # phrases per training batch
BLOCK_SIZE = 32    # number of token positions per training sample
EMBED_SIZE = 256   # embedding width (d_model of the transformer layers)
NHEAD = 4          # attention heads per layer
NLAYERS = 4        # number of stacked transformer layers
EPOCHS = 100       # full passes over the training data

# Read the corpus: one phrase per line, with surrounding whitespace removed and
# blank lines dropped.
file_name = "football_dataset_en.txt"
with open(file_name, "r", encoding="utf-8") as f:
    lines = [stripped for stripped in map(str.strip, f) if stripped]

# Keep only phrases of at least three whitespace-separated words. Each training
# sample is a (label, text) pair whose text ends with the end-of-text marker.
train_data = [
    ("futbol", f"{phrase} <|endoftext|>")
    for phrase in lines
    if len(phrase.split()) >= 3
]

# Special tokens and their vocabulary indices. The index constants rely on the
# order of this list, which is passed as `specials` (with special_first=True)
# when the vocabulary is built.
special_tokens = ['<unk>', '<pad>', '<|endoftext|>']
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# torchtext's built-in "basic_english" tokenizer, used for both training text and prompts.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data):
    """Lazily yield the tokenized text of every (label, text) pair in ``data``."""
    yield from (tokenizer(sentence) for _label, sentence in data)

# Build the vocabulary from the tokenized training phrases. The special tokens are
# inserted first so that their indices match UNK_IDX, PAD_IDX and EOS_IDX.
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=special_tokens, special_first=True)
# Tokens that are not in the vocabulary are looked up as <unk>.
vocab.set_default_index(UNK_IDX)

def text_pipeline(text):
    """Tokenize ``text`` and map each token to its vocabulary index."""
    tokens = tokenizer(text)
    return vocab(tokens)

def get_sample(text, block_size):
    """Turn one text into a next-token-prediction training pair.

    The text is converted to vocabulary indices, right-padded with ``PAD_IDX`` to
    ``block_size + 1`` entries when shorter, and split into an input window and
    the same window shifted one position to the left.

    Args:
        text: Raw text of a single training phrase.
        block_size: Number of positions in each returned tensor.

    Returns:
        ``(src, tgt)``: two 1-D ``torch.long`` tensors of length ``block_size``,
        where ``tgt[i]`` is the token that follows ``src[i]``.
    """
    # Copy so the list returned by the pipeline is never modified in place.
    indices = list(text_pipeline(text))

    # Always pad and shift, including for texts with fewer than two tokens:
    # returning src == tgt for those would train the model to copy its input,
    # whereas padded targets are simply ignored by a loss that skips PAD_IDX.
    shortfall = block_size + 1 - len(indices)
    if shortfall > 0:
        indices = indices + [PAD_IDX] * shortfall

    src = indices[:block_size]
    tgt = indices[1:block_size + 1]
    # Explicit dtype keeps the result usable as embedding indices even when empty.
    return torch.tensor(src, dtype=torch.long), torch.tensor(tgt, dtype=torch.long)

def collate_batch(batch):
    """Collate (label, text) pairs into padded input/target tensors on ``DEVICE``.

    Each text is converted with ``get_sample`` using ``BLOCK_SIZE``; the per-sample
    tensors are then stacked with ``pad_sequence`` (default ``batch_first=False``,
    so the sequence dimension comes first), padding with ``PAD_IDX``.

    Returns:
        ``(src_batch, tgt_batch)`` tensors moved to ``DEVICE``.
    """
    pairs = [get_sample(text, BLOCK_SIZE) for _label, text in batch]
    inputs = pad_sequence([src for src, _ in pairs], padding_value=PAD_IDX)
    targets = pad_sequence([tgt for _, tgt in pairs], padding_value=PAD_IDX)
    return inputs.to(DEVICE), targets.to(DEVICE)

# Shuffled mini-batches of raw (label, text) pairs; collate_batch converts each
# batch into padded index tensors on DEVICE.
dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)



In [7]:
# 🧱 Step 4: Define the model
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Even embedding channels hold ``sin(pos * freq)`` and odd channels hold
    ``cos(pos * freq)``. The table is precomputed once for ``maxlen`` positions.

    Args:
        emb_size: Width of the embeddings (size of the last input dimension).
        dropout: Dropout probability applied after the encoding is added.
        maxlen: Number of positions for which encodings are precomputed.
    """

    def __init__(self, emb_size, dropout=0.1, maxlen=5000):
        super().__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).unsqueeze(1)
        pe = torch.zeros((maxlen, emb_size))
        pe[:, 0::2] = torch.sin(pos * den)
        # With an odd emb_size there is one fewer odd channel than even channels,
        # so only the first emb_size // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(pos * den[: emb_size // 2])
        # Registered as a buffer so the table follows the module across
        # .to()/.cuda() instead of being copied to the device on every forward.
        # persistent=False keeps it out of state_dict, so saved checkpoints keep
        # exactly the same keys as before.
        self.register_buffer("pe", pe.unsqueeze(1), persistent=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``dropout(x + pe)``.

        ``x`` is indexed by position along dimension 0; the table has shape
        ``(maxlen, 1, emb_size)`` and broadcasts over dimension 1.
        """
        return self.dropout(x + self.pe[: x.size(0)])

class MiniGPT(nn.Module):
    """Small GPT-style language model built from a causally masked transformer stack.

    Token indices are embedded, scaled by ``sqrt(EMBED_SIZE)``, combined with
    positional encodings, passed through ``NLAYERS`` transformer encoder layers
    under a causal attention mask, and projected to vocabulary logits.

    Args:
        vocab_size: Number of entries in the vocabulary (embedding rows and
            output logits).
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBED_SIZE)
        self.pos_enc = PositionalEncoding(EMBED_SIZE)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=EMBED_SIZE, nhead=NHEAD),
            num_layers=NLAYERS,
        )
        self.head = nn.Linear(EMBED_SIZE, vocab_size)

    def forward(self, src):
        """Return next-token logits for every position of ``src``.

        ``src`` holds token indices with the sequence along dimension 0; the
        result has one extra trailing dimension of size ``vocab_size``.
        """
        seq_len = src.size(0)
        # Causal mask: 0.0 on and below the diagonal, -inf above it, so each
        # position can attend only to itself and earlier positions.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=src.device),
            diagonal=1,
        )
        hidden = self.embedding(src) * math.sqrt(EMBED_SIZE)
        hidden = self.pos_enc(hidden)
        hidden = self.transformer(hidden, causal_mask)
        return self.head(hidden)

# Instantiate the model sized to the vocabulary and move it to the selected device.
model = MiniGPT(len(vocab)).to(DEVICE)
# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Positions whose target is the padding index are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [8]:
# ⏳ Step 5: Training
# Train for EPOCHS passes over the data, printing the mean batch loss per epoch.
for epoch_number in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0
    for inputs, targets in dataloader:
        # Skip any batch whose input and target sequence lengths disagree.
        if inputs.size(0) != targets.size(0):
            continue
        optimizer.zero_grad()
        logits = model(inputs)
        vocab_dim = logits.size(-1)
        # Flatten all positions so each one is scored as a single classification.
        batch_loss = loss_fn(logits.view(-1, vocab_dim), targets.view(-1))
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    mean_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch_number}/{EPOCHS}, Loss: {mean_loss:.4f}")

Epoch 1/100, Loss: 4.2167
Epoch 2/100, Loss: 2.8970
Epoch 3/100, Loss: 1.6244
Epoch 4/100, Loss: 1.0066
Epoch 5/100, Loss: 0.7709
Epoch 6/100, Loss: 0.5536
Epoch 7/100, Loss: 0.4476
Epoch 8/100, Loss: 0.3653
Epoch 9/100, Loss: 0.2613
Epoch 10/100, Loss: 0.2328
Epoch 11/100, Loss: 0.1863
Epoch 12/100, Loss: 0.1697
Epoch 13/100, Loss: 0.1644
Epoch 14/100, Loss: 0.1564
Epoch 15/100, Loss: 0.0932
Epoch 16/100, Loss: 0.0843
Epoch 17/100, Loss: 0.1088
Epoch 18/100, Loss: 0.0611
Epoch 19/100, Loss: 0.0633
Epoch 20/100, Loss: 0.0730
Epoch 21/100, Loss: 0.0466
Epoch 22/100, Loss: 0.0675
Epoch 23/100, Loss: 0.0469
Epoch 24/100, Loss: 0.0354
Epoch 25/100, Loss: 0.0368
Epoch 26/100, Loss: 0.0377
Epoch 27/100, Loss: 0.0279
Epoch 28/100, Loss: 0.0228
Epoch 29/100, Loss: 0.0291
Epoch 30/100, Loss: 0.0308
Epoch 31/100, Loss: 0.0333
Epoch 32/100, Loss: 0.0245
Epoch 33/100, Loss: 0.0205
Epoch 34/100, Loss: 0.0198
Epoch 35/100, Loss: 0.0293
Epoch 36/100, Loss: 0.0293
Epoch 37/100, Loss: 0.0221
Epoch 38/1

In [9]:
# 🌟 Step 6: Generate football text (enhanced with sampling)

def generate(model, prompt, max_new_tokens=20, temperature=1.0, top_k=30, top_p=0.9,
             repetition_penalty=1.2):
    """Generate text from ``prompt`` by sampling one token at a time.

    At each step the logits of the last position are divided by ``temperature``,
    tokens already present in the sequence are penalized, ``<unk>`` is forbidden,
    and the distribution is restricted by nucleus (``top_p``) and ``top_k``
    filtering before one token is sampled. Generation stops after
    ``max_new_tokens`` tokens or as soon as the end-of-text token is sampled.

    Args:
        model: Model mapping a ``(seq_len, 1)`` index tensor to per-position logits.
        prompt: Text to continue; must produce at least one token.
        max_new_tokens: Maximum number of tokens to append.
        temperature: Positive scaling factor for the logits (higher = more random).
        top_k: Keep at most this many highest-scoring tokens; ``0`` or less
            disables the filter. Values above the vocabulary size are clamped.
        top_p: Cumulative-probability threshold for nucleus filtering.
        repetition_penalty: Amount subtracted from the logit of every token that
            already appears in the sequence.

    Returns:
        The prompt tokens followed by the generated tokens, joined with spaces.

    Raises:
        ValueError: If ``temperature`` is not positive or the prompt has no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")
    tokens = text_pipeline(prompt)
    if not tokens:
        raise ValueError("prompt must contain at least one token")

    model.eval()
    tokens_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(1).to(DEVICE)
    for _ in range(max_new_tokens):
        with torch.no_grad():
            output = model(tokens_tensor)
        logits = output[-1, 0, :] / temperature

        # view(-1) always gives a 1-D tensor, so tolist() is a list even when the
        # sequence holds a single token (squeeze() would collapse it to a scalar).
        for tok_id in set(tokens_tensor.view(-1).tolist()):
            logits[tok_id] -= repetition_penalty

        logits[UNK_IDX] = float('-inf')

        # Nucleus filtering: drop tokens whose cumulative probability exceeds
        # top_p, but always keep the single most likely token.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[0] = False
        logits[sorted_indices[sorted_indices_to_remove]] = float('-inf')

        if top_k > 0:
            # Clamp k: torch.topk raises when k exceeds the vocabulary size.
            k = min(top_k, logits.size(-1))
            kth_best = torch.topk(logits, k)[0][-1]
            logits[logits < kth_best] = float('-inf')

        probabilities = torch.softmax(logits, dim=0)
        next_token = torch.multinomial(probabilities, num_samples=1).unsqueeze(1)
        tokens_tensor = torch.cat([tokens_tensor, next_token], dim=0)
        if next_token.item() == EOS_IDX:
            break

    # Fetch the index-to-token table once instead of once per output token.
    itos = vocab.get_itos()
    output_tokens = tokens_tensor.squeeze(1).tolist()
    return " ".join(itos[idx] for idx in output_tokens)


In [11]:
# Test Prompt
# Prompt words that are not in the training vocabulary are looked up as <unk>
# (the vocabulary's default index), so they appear as "<unk>" at the start of
# the generated text.
prompt = "Halland score"
print("\nGenerated text:")
# temperature=1.2 divides the logits by more than the default 1.0 before sampling.
print(generate(model, prompt, temperature=1.2))



Generated text:
<unk> <unk> after a long pass from midfield . the coach praised his performance at the press conference . <|endoftext|>
