In [1]:
import time
import json
import re
import os
import sys
# Add the project root directory to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)

import pandas as pd
from openai import OpenAI

from src.analysis import *

In [2]:
def append_output_to_file(output):
    """
    Appends the given output to a file with a separator.

    Parameters:
    - output (str): The text to append to the file.
    - file_path (str): Path to the file where the output will be appended.

    Returns:
    - None
    """
    dump_file = '/Users/cb/src/nba_mvp_ml/data/gpt-4o-output-mvps.txt'
    
    separator = "\n" + "-" * 80 + "\n"  # Line of dashes for separation
    with open(dump_file, "a", encoding="utf-8") as file:
        file.write(output + separator)

In [3]:
def write_season(year):
    return f'{year}-{str(year+1)[2:]}'

def setup_openai_client(api_key_path, endpoint, model_name):
    """
    Sets up the OpenAI client using the provided API key and endpoint.

    Args:
    api_key_path (str): Path to the file containing the API key.
    endpoint (str): The base URL for the OpenAI API endpoint.
    model_name (str): The name of the model to use.

    Returns:
    tuple: A tuple containing the initialized OpenAI client and the model name.
    """
    try:
        # Read the API key from the file
        with open(api_key_path, "r") as file:
            api_key = file.read().strip()
        
        # Initialize the OpenAI client
        client = OpenAI(
            # base_url=endpoint,
            api_key=api_key,
        )
        
        return client, model_name

    except FileNotFoundError as e:
        raise FileNotFoundError(f"API key file not found: {e.filename}")
    except Exception as e:
        raise RuntimeError(f"Failed to set up OpenAI client: {str(e)}")

def load_json_files(prompts_path, role_path):
    """
    Reads two JSON files and returns their contents as dictionaries.
    
    Args:
    prompts_path (str): Path to the JSON file containing GPT prompts.
    role_path (str): Path to the JSON file containing GPT role configuration.
    
    Returns:
    tuple: A tuple containing two dictionaries, `gpt_prompts` and `gpt_role`.
    """
    try:
        # Read the prompts JSON file
        with open(prompts_path, 'r', encoding='utf-8') as file:
            gpt_prompts = json.load(file)
        
        # Read the role JSON file
        with open(role_path, 'r', encoding='utf-8') as file:
            gpt_role = json.load(file)
        
        return gpt_prompts, gpt_role
    
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {e.filename}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format in file: {e.msg}")

def extract_rating(input_string):
    """
    Extracts the first single digit in the format [[X / Y]] from the input string.
    
    Args:
    input_string (str): The input string containing the rating.
    
    Returns:
    int: The extracted single digit as an integer.
    """
    match = re.search(r'\[\s*\[\s*(\-*\d+\.*\d*)\s*/\s*\d+\s*\]\s*\]', input_string)
    if match:
        return float(match.group(1))
    else:
        print(input_string)
        print("Rating in the format [[X / Y]] not found in the input string.")
        return -1

def add_sentiment_avg(df):
    """
    Adds a column `sentiment_avg` to the DataFrame, which is the average
    of numeric data from the specified columns for each row.

    Parameters:
    df (pd.DataFrame): Input DataFrame

    Returns:
    pd.DataFrame: Updated DataFrame with the new `sentiment_avg` column
    """

    sentiment_cols = [
    'sentiment_1', 'sentiment_2', 'sentiment_3', 'sentiment_4',
    'sentiment_5', 'sentiment_6', 'sentiment_7', 'sentiment_8',
    'sentiment_9', 'sentiment_10', 'sentiment_11', 'sentiment_12',
    'sentiment_13', 'sentiment_14', 'sentiment_15'
]
    df["sentiment_avg"] = df[sentiment_cols].mean(axis=1)
    return df


def analyze_sentiment(client, model_name, role, prompt, year, player_name, temperature=1.0, top_p=1.0, max_tokens=1000):
    content = f'Consider the {year}-{str(year+1)[2:]} NBA season.  Give your response with respect to {player_name}. {prompt['prompt']}.'

    
    try:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": role,
                },
                {
                    "role": "user",
                    "content": content,
                }
            ],
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            model=model_name
        )
    except Exception as e:
        print (f"Failed to receive response from OpenAI client with error: {str(e)}")    
        
    text = response.choices[0].message.content
    append_output_to_file(f'{write_season(year)} {player_name}\n{prompt['title']}: {text}')
    rating = extract_rating(text)

    print(f'{write_season(year)}: {player_name} - {prompt['title']}: {rating}')
        
    return rating
        
def tell_mvp_story(client, model_name, role, prompts, year, player_name, temperature=1.0, top_p=1.0, max_tokens=1000, sleep=7):
    ratings = {
        player_name:{}
    }
    
    for prompt in prompts.keys():
        rating = analyze_sentiment(
            client=client,                     
            model_name=model_name,                     
            role=role,                     
            prompt=prompts[prompt],   
            year=year,
            player_name=player_name,                     
            temperature=temperature,                     
            top_p=top_p,                     
            max_tokens=max_tokens
        )

        ratings[player_name][prompt] = rating

        time.sleep(sleep) # Accomodate request limit ~10/min

    return ratings

def process_mvp_stories_for_year(client, model_name, role, prompts, year, df, temperature=1.0, top_p=1.0, max_tokens=1000, sleep=7):
    """
    Processes the MVP story for the first five players in the DataFrame and
    writes the sentiment ratings into new columns for each prompt key.

    Parameters:
    - client (object): ChatGPT client instance.
    - model_name (str): The model name to use for predictions.
    - role (str): System role for ChatGPT.
    - prompts (dict): Dictionary of prompts.
    - year (int): NBA season year (e.g., 2023 for the 2023-24 season).
    - temperature (float): Temperature parameter for ChatGPT.
    - top_p (float): Top-p parameter for ChatGPT.
    - max_tokens (int): Maximum tokens for each ChatGPT response.
    - sleep (int): Sleep duration between requests to avoid rate limits.

    Saves:
    - Overwrites the CSV with new sentiment columns added.
    """

    # Initialize new columns for each prompt key
    for prompt_key in prompts.keys():
        df[f"sentiment_{prompt_key}"] = 0  # Default to 0 for all rows

    # Process only the top 5 players based on the 'Rank' column
    for index, row in df.iterrows():
        if index < 7:  # Only process the first 7 rows
            player_name = row['Player']
            ratings = tell_mvp_story(
                client=client,
                model_name=model_name,
                role=role,
                prompts=prompts,
                year=year,
                player_name=player_name,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                sleep=sleep
            )

            # Add ratings for each prompt key to the corresponding column
            for prompt_key, rating in ratings[player_name].items():
                df.at[index, f"sentiment_{prompt_key}"] = rating

            df = add_sentiment_avg(df)
        else:
            # Skip processing for rows beyond the top 5
            break

    # Save the modified DataFrame back to the file
    file_path = f'/Users/cb/src/nba_mvp_ml/data/processed/by_season/mvp/sentiment/mvp_{year}-{str(year+1)[2:]}.csv'
    df.to_csv(file_path, index=False)
    print(f"Updated DataFrame saved to {file_path}")

In [4]:
# JSON Prompts
prompts_path = '/Users/cb/src/nba_mvp_ml/json/mvp-qualitative.json'
role_path = '/Users/cb/src/nba_mvp_ml/json/mvp-role.json'

gpt_prompts, gpt_role = load_json_files(prompts_path, role_path)

# OpenAI Client
# api_key_path = "/Users/cb/src/gpt-4o-api-key-expires-02-25.txt"
api_key_path = "/Users/cb/src/nba_mvp_ml-key-paid.txt"

endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o"

client, model_name = setup_openai_client(api_key_path, endpoint, model_name)    

In [5]:
for year in list(range(1980,2024)) :
    player_df, team_df, mvp_df = load_year(year, mvp_path='/Users/cb/src/nba_mvp_ml/data/processed/by_season/mvp')

    process_mvp_stories_for_year(
        client=client, 
        model_name=model_name, 
        role=gpt_role['role'], 
        prompts=gpt_prompts, 
        year=year, 
        df=mvp_df,
        temperature=1.0, 
        top_p=1.0, 
        max_tokens=1000,
        sleep=1
    )

1980-81: JULIUS ERVING - Team Success and Playoff Position: 8.0
1980-81: JULIUS ERVING - Impact on Winning: 9.0
1980-81: JULIUS ERVING - Narratives and Storylines: 8.0
1980-81: JULIUS ERVING - Voter Fatigue: 2.0
1980-81: JULIUS ERVING - Clutch Performances: 7.0
1980-81: JULIUS ERVING - Media and Fan Sentiment: 6.0
1980-81: JULIUS ERVING - Leadership and Intangibles: 9.0
1980-81: JULIUS ERVING - Historical Significance: 8.0
1980-81: JULIUS ERVING - Defying Expectations: 7.0
1980-81: JULIUS ERVING - Competitor Context: 8.0
1980-81: JULIUS ERVING - Defensive Impact: 7.0
1980-81: JULIUS ERVING - Market Size and Visibility: 6.0
1980-81: JULIUS ERVING - Postseason Expectations: 7.0
1980-81: JULIUS ERVING - Role and Usage: 9.0
1980-81: JULIUS ERVING - Player Popularity: 8.0
1980-81: LARRY BIRD - Team Success and Playoff Position: 8.0
1980-81: LARRY BIRD - Impact on Winning: 8.0
1980-81: LARRY BIRD - Narratives and Storylines: 6.0
1980-81: LARRY BIRD - Voter Fatigue: 0.0
1980-81: LARRY BIRD - 


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '8.5' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



1982-83: SIDNEY MONCRIEF - Team Success and Playoff Position: 6.0
1982-83: SIDNEY MONCRIEF - Impact on Winning: 8.0
1982-83: SIDNEY MONCRIEF - Narratives and Storylines: 4.0
1982-83: SIDNEY MONCRIEF - Voter Fatigue: 0.0
1982-83: SIDNEY MONCRIEF - Clutch Performances: 6.0
1982-83: SIDNEY MONCRIEF - Media and Fan Sentiment: 4.0
1982-83: SIDNEY MONCRIEF - Leadership and Intangibles: 6.0
1982-83: SIDNEY MONCRIEF - Historical Significance: 5.0
1982-83: SIDNEY MONCRIEF - Defying Expectations: 6.0
1982-83: SIDNEY MONCRIEF - Competitor Context: 8.0
1982-83: SIDNEY MONCRIEF - Defensive Impact: 7.0
1982-83: SIDNEY MONCRIEF - Market Size and Visibility: 6.0
1982-83: SIDNEY MONCRIEF - Postseason Expectations: 4.0
1982-83: SIDNEY MONCRIEF - Role and Usage: 5.0
1982-83: SIDNEY MONCRIEF - Player Popularity: 3.0
1982-83: JULIUS ERVING - Team Success and Playoff Position: 8.0
1982-83: JULIUS ERVING - Impact on Winning: 9.0
1982-83: JULIUS ERVING - Narratives and Storylines: 7.0
1982-83: JULIUS ERVING -


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '6.5' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



2008-09: DWYANE WADE - Team Success and Playoff Position: 7.0
2008-09: DWYANE WADE - Impact on Winning: 9.0
2008-09: DWYANE WADE - Narratives and Storylines: 8.0
2008-09: DWYANE WADE - Voter Fatigue: 0.0
2008-09: DWYANE WADE - Clutch Performances: 8.0
2008-09: DWYANE WADE - Media and Fan Sentiment: 7.0
2008-09: DWYANE WADE - Leadership and Intangibles: 8.0
2008-09: DWYANE WADE - Historical Significance: 8.0
2008-09: DWYANE WADE - Defying Expectations: 9.0
2008-09: DWYANE WADE - Competitor Context: 7.0
2008-09: DWYANE WADE - Defensive Impact: 7.0
2008-09: DWYANE WADE - Market Size and Visibility: 6.0
2008-09: DWYANE WADE - Postseason Expectations: 6.0
2008-09: DWYANE WADE - Role and Usage: 9.0
2008-09: DWYANE WADE - Player Popularity: 6.0
2008-09: DWIGHT HOWARD - Team Success and Playoff Position: 6.0
2008-09: DWIGHT HOWARD - Impact on Winning: 8.0
2008-09: DWIGHT HOWARD - Narratives and Storylines: 6.0
2008-09: DWIGHT HOWARD - Voter Fatigue: 0.0
2008-09: DWIGHT HOWARD - Clutch Performa


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '9.5' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



2009-10: KEVIN DURANT - Team Success and Playoff Position: 7.0
2009-10: KEVIN DURANT - Impact on Winning: 7.0
2009-10: KEVIN DURANT - Narratives and Storylines: 7.0
2009-10: KEVIN DURANT - Voter Fatigue: 6.0
2009-10: KEVIN DURANT - Clutch Performances: 7.0
2009-10: KEVIN DURANT - Media and Fan Sentiment: 6.0
2009-10: KEVIN DURANT - Leadership and Intangibles: 5.0
2009-10: KEVIN DURANT - Historical Significance: 6.0
2009-10: KEVIN DURANT - Defying Expectations: 8.0
2009-10: KEVIN DURANT - Competitor Context: 8.0
2009-10: KEVIN DURANT - Defensive Impact: 3.0
2009-10: KEVIN DURANT - Market Size and Visibility: 5.0
2009-10: KEVIN DURANT - Postseason Expectations: 4.0
2009-10: KEVIN DURANT - Role and Usage: 7.0
2009-10: KEVIN DURANT - Player Popularity: 6.0
2009-10: KOBE BRYANT - Team Success and Playoff Position: 8.0
2009-10: KOBE BRYANT - Impact on Winning: 9.0
2009-10: KOBE BRYANT - Narratives and Storylines: 8.0
2009-10: KOBE BRYANT - Voter Fatigue: 6.0
2009-10: KOBE BRYANT - Clutch Per


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '9.5' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



2020-21: JOEL EMBIID - Team Success and Playoff Position: 7.0
2020-21: JOEL EMBIID - Impact on Winning: 8.0
2020-21: JOEL EMBIID - Narratives and Storylines: 7.0
2020-21: JOEL EMBIID - Voter Fatigue: 1.0
2020-21: JOEL EMBIID - Clutch Performances: 7.0
2020-21: JOEL EMBIID - Media and Fan Sentiment: 6.0
2020-21: JOEL EMBIID - Leadership and Intangibles: 8.0
2020-21: JOEL EMBIID - Historical Significance: 6.0
2020-21: JOEL EMBIID - Defying Expectations: 7.0
2020-21: JOEL EMBIID - Competitor Context: 7.0
2020-21: JOEL EMBIID - Defensive Impact: 7.0
2020-21: JOEL EMBIID - Market Size and Visibility: 7.0
2020-21: JOEL EMBIID - Postseason Expectations: 6.0
2020-21: JOEL EMBIID - Role and Usage: 9.0
2020-21: JOEL EMBIID - Player Popularity: 6.0
2020-21: STEPHEN CURRY - Team Success and Playoff Position: 7.0
2020-21: STEPHEN CURRY - Impact on Winning: 9.0
2020-21: STEPHEN CURRY - Narratives and Storylines: 9.0
2020-21: STEPHEN CURRY - Voter Fatigue: 3.0
2020-21: STEPHEN CURRY - Clutch Performa


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '6.5' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



2020-21: DAMIAN LILLARD - Team Success and Playoff Position: 6.0
2020-21: DAMIAN LILLARD - Impact on Winning: 8.0
2020-21: DAMIAN LILLARD - Narratives and Storylines: 7.0
2020-21: DAMIAN LILLARD - Voter Fatigue: 1.0
2020-21: DAMIAN LILLARD - Clutch Performances: 9.0
2020-21: DAMIAN LILLARD - Media and Fan Sentiment: 6.0
2020-21: DAMIAN LILLARD - Leadership and Intangibles: 8.0
2020-21: DAMIAN LILLARD - Historical Significance: 6.0
2020-21: DAMIAN LILLARD - Defying Expectations: 7.0
2020-21: DAMIAN LILLARD - Competitor Context: 8.0
2020-21: DAMIAN LILLARD - Defensive Impact: 3.0
2020-21: DAMIAN LILLARD - Market Size and Visibility: 5.0
2020-21: DAMIAN LILLARD - Postseason Expectations: 4.0
2020-21: DAMIAN LILLARD - Role and Usage: 8.0
2020-21: DAMIAN LILLARD - Player Popularity: 6.0
Updated DataFrame saved to /Users/cb/src/nba_mvp_ml/data/processed/by_season/mvp/sentiment/mvp_2020-21.csv
2021-22: NIKOLA JOKIĆ - Team Success and Playoff Position: 7.0
2021-22: NIKOLA JOKIĆ - Impact on Win

In [6]:
mvp_df.columns

Index(['Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share',
       'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS',
       'WS/48', 'sentiment_1', 'sentiment_2', 'sentiment_3', 'sentiment_4',
       'sentiment_5', 'sentiment_6', 'sentiment_7', 'sentiment_8',
       'sentiment_9', 'sentiment_10', 'sentiment_11', 'sentiment_12',
       'sentiment_13', 'sentiment_14', 'sentiment_15', 'sentiment_avg'],
      dtype='object')

In [8]:
player_df.columns

Index(['Unnamed: 0', 'PLAYER_ID', 'PLAYER_FULLNAME', 'SEASON_ID', 'LEAGUE_ID',
       'TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', 'MIN_PG', 'FGM_PG', 'FGA_PG', 'FG_PCT_PG', 'FG3M_PG',
       'FG3A_PG', 'FG3_PCT_PG', 'FTM_PG', 'FTA_PG', 'FT_PCT_PG', 'OREB_PG',
       'DREB_PG', 'REB_PG', 'AST_PG', 'STL_PG', 'BLK_PG', 'TOV_PG', 'PF_PG',
       'PTS_PG', 'TS%', 'eFG%', 'Rk', 'GP', 'MIN', '3PAr', 'ASTPct', 'BLKPct',
       'BPM', 'DBPM', 'DRBPct', 'DWS', 'FTr', 'GS', 'OBPM', 'ORBPct', 'OWS',
       'PER', 'STLPct', 'TOVPct', 'TRBPct', 'TSPct', 'USGPct', 'VORP', 'WS/48',
       'Year', 'PLAYER_AGE', 'TEAM_ABBREVIATION', 'WS'],
      dtype='object')

In [14]:
def merge_dfs(player_df, team_df, mvp_df, include_non_mvp=False, debug=False):
    # Merge player and team data
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))

    # Add a column to differentiate MVP candidates
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP Candidate' if x in mvp_df['Player'].values else 'Other'
    )

    # Merge MVP voting data into the player/team dataset
    merged_with_mvp = pd.merge(
        merged_df,
        mvp_df.drop(columns=['Age','Tm']),
        # mvp_df[['Player', 'Pts Won', 'Pts Max', 'Share', 
        #         'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 
        #         'FG%', '3P%', 'FT%', 'WS', 'WS/48',
        #         'sentiment_1', 'sentiment_2', 'sentiment_3', 'sentiment_4',
        #         'sentiment_5', 'sentiment_6', 'sentiment_7', 'sentiment_8',
        #         'sentiment_9', 'sentiment_10', 'sentiment_11', 'sentiment_12',
        #         'sentiment_13', 'sentiment_14', 'sentiment_15']],  # Select key MVP metrics
        how='left',
        left_on='PLAYER_FULLNAME',
        right_on='Player'
    )

    # Replace metrics with MVP data where available, with fallback logic
    merged_with_mvp['WS'] = merged_with_mvp['WS_y'].fillna(merged_with_mvp['WS_x'])
    merged_with_mvp['PTS'] = merged_with_mvp['PTS'].fillna(merged_with_mvp['PTS_player'])
    merged_with_mvp['TRB'] = merged_with_mvp['TRB_y'].fillna(merged_with_mvp['REB'])
    merged_with_mvp['AST'] = merged_with_mvp.get('AST_y', merged_with_mvp['AST_player'])  # Fallback if 'AST_y' is missing

    # Optionally filter out non-MVP candidates
    if not include_non_mvp:
        merged_with_mvp = merged_with_mvp[merged_with_mvp['MVP_Candidate'] != 'Other']
    
    if debug:
        display(sorted(list(merged_with_mvp.columns)))
    
    # Drop unnecessary columns and avoid confusion between suffixes
    merged_with_mvp = merged_with_mvp.rename(columns={'Pts Won': 'Pts_Won'})

    return merged_with_mvp

list(merge_dfs(*load_year(2021)).columns)




['Unnamed: 0',
 'PLAYER_ID',
 'PLAYER_FULLNAME',
 'SEASON_ID',
 'LEAGUE_ID',
 'TEAM_ID',
 'FGM',
 'FGA_player',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA_player',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST_player',
 'STL_player',
 'BLK_player',
 'TOV_player',
 'PF_player',
 'PTS_player',
 'MIN_PG',
 'FGM_PG',
 'FGA_PG',
 'FG_PCT_PG',
 'FG3M_PG',
 'FG3A_PG',
 'FG3_PCT_PG',
 'FTM_PG',
 'FTA_PG',
 'FT_PCT_PG',
 'OREB_PG',
 'DREB_PG',
 'REB_PG',
 'AST_PG',
 'STL_PG',
 'BLK_PG',
 'TOV_PG',
 'PF_PG',
 'PTS_PG',
 'TS%_player',
 'eFG%_player',
 'Rk',
 'GP',
 'MIN',
 '3PAr_player',
 'ASTPct',
 'BLKPct',
 'BPM',
 'DBPM',
 'DRBPct',
 'DWS',
 'FTr_player',
 'GS',
 'OBPM',
 'ORBPct',
 'OWS',
 'PER',
 'STLPct',
 'TOVPct',
 'TRBPct',
 'TSPct',
 'USGPct',
 'VORP',
 'WS/48_x',
 'Year',
 'PLAYER_AGE',
 'TEAM_ABBREVIATION_player',
 'WS_x',
 'Rk_trad',
 'Team',
 'G_x',
 'MP_x',
 'FG',
 'FGA_team',
 'FG%_x',
 '3P',
 '3PA',
 '3P%_x',
 '2P',
 '2PA',
 '2P%',
 'FT',
 'FTA_team',
 'FT%_x',
 'ORB',
