In [1]:
import pandas as pd
import time
import json
import textwrap

In [2]:
# Load the combined match dataset and print its first rows as a quick sanity check.
dataset_path = '../../datasets/processed/Combined_Dataset.csv'
df = pd.read_csv(dataset_path)
print(df.head())

   MatchID               Home                    Away  \
0        0       Swansea City              Stoke City   
1        2  Tottenham Hotspur          Leicester City   
2        3          Liverpool  Brighton & Hove Albion   
3        4            Burnley         AFC Bournemouth   
4        5        Southampton         Manchester City   

                        Date   Season  \
0  Fri 11 May 2018 17.23 BST  2017-18   
1  Fri 11 May 2018 16.43 BST  2017-18   
2  Fri 11 May 2018 16.33 BST  2017-18   
3  Fri 11 May 2018 16.23 BST  2017-18   
4  Fri 11 May 2018 16.07 BST  2017-18   

                                                Text              DateTime  \
0  A match many thought would have so much riding...  2018-05-13T00:00:00Z   
1  Leicester’s 3-1 win over Arsenal on Wednesday ...  2018-05-13T00:00:00Z   
2  Complaints over Liverpool’s recent league form...  2018-05-13T00:00:00Z   
3  Burnley are looking for their first win in fiv...  2018-05-13T00:00:00Z   
4  Manchester City’s

In [3]:
def get_match_info(match_df_idx, data=None):
    """Collect the fields of a single match that the prompts and logs need.

    Args:
        match_df_idx: Positional (``iloc``) row index of the match.
        data: Optional DataFrame to read the row from. When omitted, the
            notebook-level ``df`` is used, so existing callers are unaffected.

    Returns:
        dict with the keys ``Match_ID``, ``Home``, ``Away``, ``Preview`` and
        ``Season``, filled from the ``MatchID``, ``Home``, ``Away``, ``Text``
        and ``Season`` columns of the selected row.

    Raises:
        IndexError: If ``match_df_idx`` is outside the frame (raised by ``iloc``).
        KeyError: If one of the expected columns is missing.
    """
    frame = df if data is None else data

    # Fetch the row once rather than repeating the positional lookup per field.
    row = frame.iloc[match_df_idx]

    return {
        "Match_ID": row['MatchID'],
        "Home": row['Home'],
        "Away": row['Away'],
        "Preview": row['Text'],
        "Season": row['Season'],
    }

In [4]:
def get_time():
    """Return the current local time as an ``HH:MM:SS`` string for log prefixes."""
    clock_format = '%H:%M:%S'
    return time.strftime(clock_format)

### Prompts

In [5]:
def create_prompt(match_df_idx, version=1):
    """Build the classification prompt for one match preview.

    The prompt asks the model to rate, on a 1-5 scale, which side the preview
    favors (1 = home strongly favored ... 5 = away strongly favored).

    Args:
        match_df_idx: Positional row index of the match, passed to
            ``get_match_info``.
        version: Prompt wording to use (1, 2 or 3).

    Returns:
        The prompt as a single string, with sentences separated by one space.

    Raises:
        ValueError: If ``version`` is not one of the supported versions.
            Previously an unknown version silently produced an empty prompt.
    """
    match_info = get_match_info(match_df_idx)
    home = match_info['Home']
    away = match_info['Away']
    preview = match_info['Preview']

    if version == 1:
        # Version 1 used to concatenate these sentences without any separator,
        # producing text such as "...to win.Respond with...". Joining with a
        # space brings it in line with versions 2 and 3.
        sentences = [
            "You are tasked with reading a preview for a football match and determining based on the preview which team is being favored to win.",
            "Respond with the number based on the following scale that best matches the sentiment of the preview.",
            "If the home team is strongly favored to win, respond with 1.",
            "If the home team is slightly favored to win, respond with 2.",
            "If the match is balanced with no clear favor, respond with 3.",
            "If the away team is slightly favored to win, respond with 4.",
            "If the away team is strongly favored to win, respond with 5.",
            f"The home team is {home} and the away team is {away}.",
            f"The preview is: {preview}",
        ]

    elif version in (2, 3):
        # Versions 2 and 3 share the same team-specific rating scale.
        scale = [
            f"If the home team {home} is strongly favored, respond '1'.",
            f"If the home team {home} is slightly favored, respond '2'.",
            "If the match is balanced with no clear favorite, respond '3'.",
            f"If the away team {away} is slightly favored, respond '4'.",
            f"If the away team {away} is strongly favored, respond '5'.",
        ]

        if version == 2:
            sentences = [
                f"Is the home team {home} or the away team {away} favored to win the match based on the preview?",
                "Use the underlying sentiment of the preview and understanding of football language and context.",
                *scale,
                f"Match preview: {preview}",
            ]
        else:
            sentences = [
                f"You are tasked with identifying if the home team {home} or the away team {away} is favored to win based on the match preview.",
                "Do not use any historical performances or statistics to make your decision.",
                "Just analyze the underlying sentiment of the preview and use understanding of football language and context.",
                *scale,
                "Only respond with the number based on the scale that best matches the sentiment of the preview.",
                "Although you should understand why you chose that number, you don't need to explain why.",
                f"Match preview: {preview}",
            ]

    else:
        raise ValueError(f"Unsupported prompt version: {version!r} (expected 1, 2 or 3)")

    return " ".join(sentences)

### OpenAI

In [6]:
from openai import OpenAI

In [7]:
# The OpenAI API key is the first line of a local text file; surrounding whitespace is stripped.
openai_api_key_dir = "../../../API Keys/openai.txt"
with open(openai_api_key_dir) as f:
    lines = f.readlines()
openai_api_key = lines[0].strip()

In [8]:
# Notebook-level OpenAI client, authenticated with openai_api_key; used by get_openai_response.
client = OpenAI(api_key=openai_api_key)

In [9]:
def get_openai_response(prompt, model="gpt-4o-mini", preliminary_content_version=1):
    """Send one prompt to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Name of the chat model to query.
        preliminary_content_version: Selector for the system message. Every
            value currently maps to the same system message, so it has no
            effect on the request yet.

    Returns:
        The ``content`` of the message in the first choice of the completion.
    """
    # All preliminary-content versions share this system message for now.
    system_message = "You are an AI assistant that analyzes and classifies sentiment in match previews for predicting match outcomes for the Premier League."

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model=model, messages=conversation)

    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
NUM_MODEL_SAMPLES = 5       # responses to collect per match
MAX_ALLOWED_ATTEMPTS = 8    # request attempts per match before giving up

# Load previously saved model outputs so matches that already have enough samples
# are not queried again. The context manager closes the file handle, and a missing
# file means nothing has been collected yet, so the first run starts from an empty
# mapping instead of failing.
model_output_dir = f"output/sentiment-analysis_gpt-4o-mini_prompt-v3.json"
try:
    with open(model_output_dir) as f:
        model_output_json = json.load(f)
except FileNotFoundError:
    model_output_json = {}

# Loop through each match
for match_df_idx in range(30):

    # Get match information
    match_info = get_match_info(match_df_idx)
    match_id = match_info['Match_ID']
    match_id_str = str(match_id)  # JSON object keys are strings
    home = match_info['Home']
    away = match_info['Away']
    season = match_info['Season']
    print(f"[{get_time()}] [Match ID {match_id}] Starting sentiment analysis for match ~ {home} vs {away}.")

    # Add match to model output json
    if match_id_str not in model_output_json:
        model_output_json[match_id_str] = []

    # Keep requesting until NUM_MODEL_SAMPLES valid responses are stored for this match
    attempts = 0
    while len(model_output_json[match_id_str]) < NUM_MODEL_SAMPLES:

        # Break early if too many attempts (failed and rejected responses count too)
        attempts += 1
        if attempts > MAX_ALLOWED_ATTEMPTS:
            print(f"[{get_time()}] [Match ID {match_id}] Max attempts ({attempts-1}) reached.")
            break

        # Get model output
        try:
            print(f"[{get_time()}] [Match ID {match_id}] Trying request attempt {attempts}.")
            prompt = create_prompt(match_df_idx, version=3)
            output = get_openai_response(prompt)
            output = output.strip()

            # Accept only a single character that is one of the scale digits 1-5
            if len(output) > 1:
                print(f"[{get_time()}] [Match ID {match_id}] Issue with model output, longer than 1 character. ~ Output:\n{textwrap.fill(output, width=100)}")
                continue

            elif not(any(char in output for char in '12345')):
                print(f"[{get_time()}] [Match ID {match_id}] Issue with model output, No classification found in output. ~ Output:\n{textwrap.fill(output, width=100)}")
                continue

            # Add output to model output json and persist after every accepted response
            model_output_json[match_id_str].append(output)
            with open(model_output_dir, 'w') as f:
                json.dump(model_output_json, f, indent=4)
            print(f"[{get_time()}] [Match ID {match_id}] Successfully recieved responce {len(model_output_json[match_id_str])}.")

        except Exception as e:
            # Best effort: report the failure and let the loop retry until the attempt cap
            print(f"[{get_time()}] [Match ID {match_id}] Error getting model output: {e}")

            # if "429" in str(e):
            #     print(f"[{get_time()}] [Match ID {match_id}] Pausing new requests for 1 minute.")
            #     time.sleep(60)

    print(f"\n[{get_time()}] [Match ID {match_id}] model_output_json[{match_id}]: {model_output_json[match_id_str]}\n")

print("\nFINAL RESULTS:")
for match_id_str in model_output_json:
    print(f"Match {match_id_str} output: {model_output_json[match_id_str]}")

[00:33:01] [Match ID 0] Starting sentiment analysis for match ~ Swansea City vs Stoke City.
[00:33:01] [Match ID 0] Trying request attempt 1.
[00:33:02] [Match ID 0] Successfully recieved responce 3.
[00:33:02] [Match ID 0] Trying request attempt 2.
[00:33:02] [Match ID 0] Successfully recieved responce 4.
[00:33:02] [Match ID 0] Trying request attempt 3.
[00:33:02] [Match ID 0] Successfully recieved responce 5.

[00:33:02] [Match ID 0] model_output_json[0]: ['3', '3', '3', '3', '3']

[00:33:02] [Match ID 2] Starting sentiment analysis for match ~ Tottenham Hotspur vs Leicester City.
[00:33:02] [Match ID 2] Trying request attempt 1.
[00:33:02] [Match ID 2] Successfully recieved responce 3.
[00:33:02] [Match ID 2] Trying request attempt 2.
[00:33:03] [Match ID 2] Successfully recieved responce 4.
[00:33:03] [Match ID 2] Trying request attempt 3.
[00:33:03] [Match ID 2] Successfully recieved responce 5.

[00:33:03] [Match ID 2] model_output_json[2]: ['3', '3', '3', '2', '3']

[00:33:03] 

### Gemini

In [None]:
import google.generativeai as genai

In [None]:
# The Gemini API key is the first line of a local text file; surrounding whitespace is stripped.
gemini_api_key_dir = "../../../API Keys/gemini.txt"
with open(gemini_api_key_dir) as f:
    lines = f.readlines()
gemini_api_key = lines[0].strip()

In [None]:
# Configure the google.generativeai SDK with gemini_api_key, then create a handle
# for the "gemini-1.5-flash" model (only referenced by the commented-out cells below).
genai.configure(api_key=gemini_api_key)
model_gemini_15_flash = genai.GenerativeModel("gemini-1.5-flash")

In [None]:
# model_output = {
#     '0': ['3', '3', '3', '3', '3'],
#     '1': ['2', '2', '2', '2', '2'],
#     '2': ['2', '2', '2', '2', '1'],
#     '3': ['3', '3', '2', '2', '2'],
#     '4': ['5', '5', '5', '5', '5'],
#     '5': ['2', '4', '4', '4', '2']
# }

# # Test model consistency
# for match_idx in range(100):
#     match_idx_str = str(match_idx)

#     if match_idx_str not in model_output:
#         model_output[match_idx_str] = []
    
#     while len(model_output[match_idx_str]) < 5:
#         try:
#             print(f"[{time.strftime('%H:%M:%S')}] Trying request for match {match_idx}.")
#             prompt = create_prompt(match_idx, version=2)
#             response = model_gemini_15_flash.generate_content(prompt)
#             print(f"[{time.strftime('%H:%M:%S')}] Successfully responce for match {match_idx}.")
#             model_output[match_idx_str].append(response.text.strip())
#         except Exception as e:
#             print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
#             if "429" in str(e):
#                 print(f"[{time.strftime('%H:%M:%S')}] Pausing new requests for 1 minute.")
#                 time.sleep(60)
#             elif "504" in str(e):
#                 print(f"[{time.strftime('%H:%M:%S')}] Skipping {match_idx}.")

#     print(f"[{time.strftime('%H:%M:%S')}] Match {match_idx} output: {model_output[match_idx_str]}")

# print("\nFINAL RESULTS:")
# for match in model_output:
#     print(f"Match {match} output: {model_output[match]}")

In [None]:
# import concurrent.futures
# import threading
# import time

# model_output = {
#     '0': ['3', '3', '3', '3', '3'],
#     '1': ['2', '2', '2', '2', '2'],
#     '2': ['2', '2', '2', '2', '1'],
#     '3': ['3', '3', '2', '2', '2'],
#     '4': ['5', '5', '5', '5', '5'],
#     '5': ['2', '4', '4', '4', '2']
# }

# pause_event = threading.Event()
# pause_event.set()

# def fetch_responses(match_idx):
#     if match_idx not in model_output:
#         model_output[match_idx] = []
#     while len(model_output[match_idx]) < 5:
#         try:
#             pause_event.wait()
#             prompt = create_prompt(match_idx, version=2)
#             print(f"[{time.strftime('%H:%M:%S')}] Getting model response for match {match_idx}...")
#             response = model_gemini_15_flash.generate_content(prompt)
#             print(f"[{time.strftime('%H:%M:%S')}] Successfully recieved model response for match {match_idx}...")
#             model_output[match_idx].append(response.text.strip())
#         except Exception as e:
#             print(f"[{time.strftime('%H:%M:%S')}] Error: {e}")
#             if "429" in str(e):
#                 print(f"[{time.strftime('%H:%M:%S')}] Pausing new requests for 2 minutes.")
#                 pause_event.clear()
#                 time.sleep(120)
#                 pause_event.set()

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     match_indices = range(8)
#     executor.map(fetch_responses, match_indices)

# for match_idx in model_output:
#     print(f"Match {match_idx} output: {model_output[match_idx]}")


In [None]:
# regions = ["europe-west4", "us-east1", "europe-north1", "europe-west1", "europe-west2", "europe-west3", "europe-west6", "europe-central2"]
# random.shuffle(regions)
# gemini_models = ["gemini-1.5-flash-002", "gemini-1.5-flash-002", "gemini-1.5-pro-002"]
# random.shuffle(gemini_models)
# selected_region = regions[0]
# selected_model = gemini_models[0]