# Creating an NBA Large-Language-Model using Falcon Framework

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datasets import Dataset
from transformers import default_data_collator
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, default_data_collator
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader

In [3]:
# First, ensure proper initialization
import ipywidgets as widgets
from IPython.display import display, clear_output

# Reset widget state
clear_output(wait=True)

In [4]:
import torch
print("GPU available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
torch.cuda.empty_cache()

GPU available: True
GPU device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [5]:
def scrape_nba_data():
    urls = [
        # Advanced Stats
        "https://www.basketball-reference.com/leaders/per_career.html",
        "https://www.basketball-reference.com/leaders/ws_career.html", 
        "https://www.basketball-reference.com/leaders/bpm_career.html",
        "https://www.basketball-reference.com/leaders/vorp_career.html",
        
        # All-Time Great Teams
        "https://www.basketball-reference.com/teams/CHI/1996.html",
        "https://www.basketball-reference.com/teams/GSW/2016.html", 
        "https://www.basketball-reference.com/teams/LAL/1972.html",
        "https://www.basketball-reference.com/teams/BOS/1986.html",
        
        # Hall of Fame Players
        "https://www.basketball-reference.com/players/r/russebi01.html",
        "https://www.basketball-reference.com/players/o/olajuha01.html",
        "https://www.basketball-reference.com/players/m/malonka01.html",
        "https://www.basketball-reference.com/players/e/ervinju01.html",
        
        # Season Leaders
        "https://www.basketball-reference.com/leaders/pts_season.html",
        "https://www.basketball-reference.com/leaders/ast_season.html",
        "https://www.basketball-reference.com/leaders/reb_season.html",
        "https://www.basketball-reference.com/leaders/blk_season.html",
        
        # All-Star Games
        "https://www.basketball-reference.com/allstar/",
        "https://www.basketball-reference.com/allstar/NBA_2023.html",
        
        # Historical Seasons
        "https://www.basketball-reference.com/leagues/NBA_1996.html",
        "https://www.basketball-reference.com/leagues/NBA_1986.html",
        "https://www.basketball-reference.com/leagues/NBA_1972.html",
        
        # Additional Open Sources
        "https://www.landofbasketball.com/records/",
        "https://www.landofbasketball.com/all_time_leaders/",
        "https://www.landofbasketball.com/year_by_year.htm",
        "https://hoopshype.com/salaries/",
        "https://www.realgm.com/nba/stats/",
        "https://www.proballers.com/basketball/",
        "https://www.usbasket.com/",
        "https://basketball.realgm.com/nba/awards/"
    ]
    
    all_data = []
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        stats = extract_stats(soup)
        cleaned_stats = clean_training_data(stats)
        all_data.append(cleaned_stats)
    
    return "\n\n".join(all_data)

def clean_training_data(text):
    cleaned = re.sub(r'(\* Indicates.*?\n)(?=\* Indicates)', '', text, flags=re.MULTILINE)
    cleaned = re.sub(r'(Active players.*?\n)(?=Active players)', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def extract_stats(soup):
    stats_text = "NBA Career Scoring Leaders:\n"
    
    # Extract scoring table with clear formatting
    scoring_leaders = soup.find('table', {'id': 'leaders'})
    if scoring_leaders:
        rows = scoring_leaders.find_all('tr')
        for row in rows[1:]:  # Skip header
            cols = row.find_all('td')
            if cols:
                rank = cols[0].text.strip()
                player = cols[1].text.strip()
                points = cols[2].text.strip()
                stats_text += f"Rank {rank}: {player} - {points} points\n"
    
    # Add contextual sentences
    stats_text += "\nKey NBA Scoring Milestones:\n"
    stats_text += "LeBron James broke the all-time scoring record on February 7, 2023\n"
    stats_text += "Previous record holder was Kareem Abdul-Jabbar with 38,387 points\n"
    
    return stats_text

scraped_data = scrape_nba_data()

In [6]:
# Save the scraped data to a text file
scraped_data = scrape_nba_data()
with open("nba_data.txt", "w", encoding='utf-8') as f:
    f.write(scraped_data)

# Step 2: Preprocessing the Data
def preprocess_data(file_path):
    """Loads and preprocesses the text data."""
    with open(file_path, "r", encoding='utf-8') as f:
        text = f.read()
    return text

preprocessed_text = preprocess_data("nba_data.txt")

# Convert to a dataset format
data_dict = {"text": [preprocessed_text]}
dataset = Dataset.from_dict(data_dict)

# Set up quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Step 3: Fine-Tuning the Model
model_name = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    use_cache=False,
    quantization_config=bnb_config,
    device_map={'': torch.cuda.current_device()},  # Forces GPU-only
)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Update tokenization function to return proper tensor format
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors=None  # Important: let the collator handle tensors
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize the dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
# Memory optimization settings
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:512'

In [9]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head"]
)
model = get_peft_model(model, lora_config)

model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-6,
    logging_steps=10,
    optim='adamw_torch_fused',
    dataloader_pin_memory=False,
    per_device_train_batch_size=1,
    num_train_epochs=100,
    warmup_ratio=0.1, 
    weight_decay=0.05,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    torch_compile=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()



Step,Training Loss
10,0.2651
20,0.264
30,0.2607
40,0.2568
50,0.2531
60,0.2501
70,0.2487
80,0.2476
90,0.2469
100,0.2466


TrainOutput(global_step=100, training_loss=0.25395263671875, metrics={'train_runtime': 78.5572, 'train_samples_per_second': 1.273, 'train_steps_per_second': 1.273, 'total_flos': 2127077376000000.0, 'train_loss': 0.25395263671875, 'epoch': 100.0})

In [10]:
# Step 4: Save the Model
model.save_pretrained("./nba_llm")
tokenizer.save_pretrained("./nba_llm")

('./nba_llm\\tokenizer_config.json',
 './nba_llm\\special_tokens_map.json',
 './nba_llm\\tokenizer.json')

In [11]:
def evaluate_model(prompt, model, tokenizer, max_length=128):
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to(model.device)
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=30, 
        do_sample=True,
        temperature=0.1,  
        top_p=0.7, 
        top_k=50,      
        repetition_penalty=1.2, 
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,  
        early_stopping=True
    )
    
    # Clean and format the response
    response = tokenizer.decode(
        outputs[0], 
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    ).strip()
    
    return response

prompt = "Who has the most rebounds in NBA history?"
response = evaluate_model(prompt, model, tokenizer)
print(f"Question: {prompt}\nAnswer: {response}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Question: Who has the most rebounds in NBA history?
Answer: Who has the most rebounds in NBA history?
Wilt Chamberlain
Who has most rebounds NBA?
The NBA’s all-time leader in rebounds is Wilt Chamberlin with


In [12]:
prompt = "Who has the most points in NBA history?"
response = evaluate_model(prompt, model, tokenizer)
print(f"Question: {prompt}\nAnswer: {response}")

Question: Who has the most points in NBA history?
Answer: Who has the most points in NBA history?
Michael Jordan
Who has scored the most NBA points?
Kareem Abdul-Jabbar
Who is the highest scorer in NBA?


In [13]:
prompt = "Who has the most 3-pointers in NBA history?"
response = evaluate_model(prompt, model, tokenizer)
print(f"Question: {prompt}\nAnswer: {response}")

Question: Who has the most 3-pointers in NBA history?
Answer: Who has the most 3-pointers in NBA history?
The NBA record for 3 pointers is held by Ray Allen with 2,973.
Who has made the most NBA 3 point shots
