In [None]:
# Imprimir nombres de columnas en ambos DataFrames para verificar
print("Columnas en 'games':", games.columns.tolist())
print("Columnas en 'game_info':", game_info.columns.tolist())

# También, verifica si la columna 'game_date' existe después de la fusión
games_full = games.merge(game_info, on='game_id', how='left')
print("Columnas en 'games_full' después de la fusión:", games_full.columns.tolist())


Columnas en 'games': ['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home', 'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home', 'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away', 'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away', 'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away', 'season_type']
Columnas en 'game_info': ['game_id', 'game_date', 'attendance', 'game_time']
Columnas en 'games_full' después de la fusión: ['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home', 'ga

In [6]:
import pandas as pd

# Cargar datos
games = pd.read_csv('C:\\Users\\carlo\\Documents\\Proyectos\\NBA_predict\\csv\\game.csv')
game_info = pd.read_csv('C:\\Users\\carlo\\Documents\\Proyectos\\NBA_predict\\csv\\game_info.csv')

# Fusionar DataFrames
games_full = games.merge(game_info, on='game_id', how='left', suffixes=('_x', '_y'))

# Convertir 'game_date_x' o 'game_date_y' a datetime
games_full['game_date'] = pd.to_datetime(games_full['game_date_x'])  # Asume que ambas columnas son intercambiables

# Ordenar por fecha
games_full.sort_values(by='game_date', inplace=True)

# Suponer que 'wl_home' indica si el equipo local ganó o perdió
games_full['target'] = (games_full['wl_home'] == 'W').astype(int)

# Seleccionar columnas relevantes para el modelo
model_data = games_full[['game_date', 'team_id_home', 'team_id_away', 'pts_home', 'pts_away', 'target']]

# Ejemplo de cómo proceder después
print(model_data.head())

# Si necesitas más ayuda con el manejo de datos o con el modelo, avísame.


   game_date  team_id_home  team_id_away  pts_home  pts_away  target
0 1946-11-01    1610610035    1610612752      66.0      68.0       0
1 1946-11-02    1610610034    1610610031      56.0      51.0       1
2 1946-11-02    1610610032    1610612738      59.0      53.0       1
3 1946-11-02    1610610025    1610612752      63.0      47.0       1
4 1946-11-02    1610610028    1610610036      33.0      50.0       0


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, Dataset

# Cargar datos
games = pd.read_csv('C:\\Users\\carlo\\Documents\\Proyectos\\NBA_predict\\csv\\game.csv')
game_info = pd.read_csv('C:\\Users\\carlo\\Documents\\Proyectos\\NBA_predict\\csv\\game_info.csv')

# Fusionar DataFrames
games_full = games.merge(game_info, on='game_id', how='left', suffixes=('_x', '_y'))

# Convertir 'game_date_x' a datetime y seleccionar columnas relevantes
games_full['game_date'] = pd.to_datetime(games_full['game_date_x'])
games_full.sort_values(by='game_date', inplace=True)
features = games_full[['team_abbreviation_home', 'team_abbreviation_away']]
target = (games_full['wl_home'] == 'W').astype(int)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Preparar secuencias para Transformer usando el tokenizador de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_features(df):
    input_texts = df['team_abbreviation_home'] + " vs " + df['team_abbreviation_away']
    inputs = tokenizer(input_texts.tolist(), padding='max_length', max_length=20, truncation=True, return_tensors="pt")
    return inputs

# Dataset personalizado para manejar los datos
class GameDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.features.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

# Codificar las características
train_features = encode_features(X_train)
test_features = encode_features(X_test)

train_dataset = GameDataset(train_features, y_train)
test_dataset = GameDataset(test_features, y_test)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Cargar el modelo de Transformers
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizador
optimizer = AdamW(model.parameters(), lr=2e-5)

# Entrenamiento del modelo
model.train()
for epoch in range(3):  # Loop over the dataset multiple times
    for i, batch in enumerate(train_loader, 0):
        inputs = {'input_ids': batch['input_ids'].to(device), 'attention_mask': batch['attention_mask'].to(device), 'labels': batch['labels'].to(device)}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if i % 10 == 0:  # Print every 10 mini-batches
            print(f'[{epoch + 1}, {i + 1}] loss: {loss.item()}')

print('Finished Training')

# Evaluar el modelo
model.eval()  # Cambiar el modelo a modo de evaluación
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = inputs['labels'].cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels)

# Calcular la precisión
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy:.4f}')

# Reporte de clasificación
print(classification_report(all_labels, all_preds, target_names=['Loss', 'Win']))

# Guardar el modelo
model_save_path = 'nba_bert_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

# Extraer y mostrar información del modelo
print("\nModel Information:")
print(f"Model architecture: {model}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
print(f"Device: {device}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


[1, 1] loss: 0.8199509382247925
[1, 11] loss: 0.7678422927856445
[1, 21] loss: 0.6669143438339233
[1, 31] loss: 0.6967447996139526
[1, 41] loss: 0.7156301736831665
[1, 51] loss: 0.6641519069671631
[1, 61] loss: 0.6800936460494995
[1, 71] loss: 0.6790087223052979
[1, 81] loss: 0.6683835387229919
[1, 91] loss: 0.6334377527236938
[1, 101] loss: 0.6835892796516418
[1, 111] loss: 0.7137469053268433
[1, 121] loss: 0.6832860112190247
[1, 131] loss: 0.6827179789543152
[1, 141] loss: 0.6313008069992065
[1, 151] loss: 0.6671985387802124
[1, 161] loss: 0.652488648891449
[1, 171] loss: 0.6942811012268066
[1, 181] loss: 0.6888742446899414
[1, 191] loss: 0.7144515514373779
[1, 201] loss: 0.6624429225921631
[1, 211] loss: 0.6213627457618713
[1, 221] loss: 0.6352006793022156
[1, 231] loss: 0.6736316084861755
[1, 241] loss: 0.6776243448257446
[1, 251] loss: 0.6386815309524536
[1, 261] loss: 0.6607494354248047
[1, 271] loss: 0.6694372296333313
[1, 281] loss: 0.668586790561676
[1, 291] loss: 0.6309654116

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved to nba_bert_model.pth

Model Information:
Model architecture: BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_fe