# In [None]:
import pandas as pd
import datetime
import glob
import os
import openai
import re
from pydantic import BaseModel
from openai import OpenAI
import json
import os
from dotenv import load_dotenv

# --- Configuration -----------------------------------------------------------
# Pull credentials from the local .env file into the process environment.
load_dotenv()

# NOTE(review): github_token is validated here but not referenced anywhere in
# the visible part of this script; the check is kept so start-up behaviour is
# unchanged.
github_token = os.getenv('GITHUB_TOKEN')
if not github_token:
    raise ValueError("GITHUB_TOKEN not found in .env file")

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Validate the OpenAI key itself (the previous check re-tested github_token,
# so a missing OPENAI_API_KEY was never reported).
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Model used for every structured-output request below.
OPENAI_MODEL = "gpt-4o-2024-08-06"

# Repository whose stargazer export is processed.
repo_owner = 'OpenBB-finance'
repo_name = 'OpenBB'

client = OpenAI(
    api_key=OPENAI_API_KEY
)

# Exports live in a folder named "<owner>_<repo>".
folder_name = f"{repo_owner}_{repo_name}"

# Candidate input files. This script writes "<input>_processed.csv" next to
# its input, and that name also matches the glob pattern, so processed outputs
# are filtered out; otherwise a second run would pick up its own output.
list_of_files = [
    path
    for path in glob.glob(os.path.join(folder_name, 'stargazer_info_*.csv'))
    if not path.endswith('_processed.csv')
]
# Most recent export by os.path.getctime, or None when no export exists.
latest_file = max(list_of_files, key=os.path.getctime) if list_of_files else None

# Structured-output schema for the fields extracted from a free-text bio.
# Passed as `response_format` in extract_data_from_bio, which returns the
# three fields as a tuple. Each field is a string or None.
# (Documented with comments rather than a class docstring so the schema
# generated from this model stays exactly as it was.)
class BioInformation(BaseModel):
    location: str | None  # location mentioned in the bio, if any
    company: str | None  # company mentioned in the bio, if any
    job_role: str | None  # job role mentioned in the bio, if any

def extract_data_from_bio(bio):
    """Extract location, company and job role from a free-text user bio.

    Sends the bio to the chat-completions ``parse`` endpoint with
    ``BioInformation`` as the response format.

    Args:
        bio: Bio text. ``None``, NaN/NA and blank strings are treated as
            "no bio" and short-circuit without calling the API.

    Returns:
        A ``(location, company, job_role)`` tuple. Each element is a string
        or ``None``; all three are ``None`` when there is no bio or when the
        response carries no parsed payload.
    """
    # Nothing to analyse: avoid a pointless API round-trip.
    if bio is None or (not isinstance(bio, str) and pd.isna(bio)):
        return None, None, None
    if isinstance(bio, str) and not bio.strip():
        return None, None, None

    prompt = f"""
    The following user bio may contain user's location (include country, state and city if present), company they work at and their role. Here's the bio:
    "{bio}"
    
    Please return the location, company and job role.
    """

    completion = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts information from text."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # keep the extraction as repeatable as possible
        response_format=BioInformation
    )

    parsed = completion.choices[0].message.parsed
    # `parsed` is None when the model refuses to answer; report "nothing
    # found" instead of failing with AttributeError.
    if parsed is None:
        return None, None, None

    return parsed.location, parsed.company, parsed.job_role


# Structured-output schema for a standardized location.
# Passed as `response_format` in standardize_location, which returns the two
# fields as a (country, state_region_city) tuple. Each field is a string or None.
# (Documented with comments rather than a class docstring so the schema
# generated from this model stays exactly as it was.)
class LocationInformation(BaseModel):
    country: str | None  # country part of the location
    state_region_city: str | None  # state, region or city part of the location

def standardize_location(location):
    """Standardize a free-text location into a country and a state/region/city.

    Sends the location to the chat-completions ``parse`` endpoint with
    ``LocationInformation`` as the response format.

    Args:
        location: Raw location text. ``None``, NaN/NA and blank strings are
            treated as "no location" and short-circuit without calling the
            API (previously a missing value was sent to the model as "nan").

    Returns:
        A ``(country, state_region_city)`` tuple. Each element is a string
        or ``None``; both are ``None`` when there is no location or when the
        response carries no parsed payload.
    """
    # Missing value: there is nothing to standardize.
    if location is None or (not isinstance(location, str) and pd.isna(location)):
        return None, None
    if isinstance(location, str) and not location.strip():
        return None, None

    prompt = f"""
    Standardize this location into country and region/state/city format: "{location}"
    If it's a US location, return "USA, State"
    If it's another country, return "Country, City/Region" if available.
    If only country is available return "Country, None".
    If only region/city is available, infer the country and return "Country, Region/City".
    If the location is unclear or invalid, return "None, None"
    
    Examples:
    "New york" -> "USA, New York"
    "NYC" -> "USA, New York"
    "Brooklyn" -> "USA, New York"
    "London, UK" -> "United Kingdom, London"
    "Tokyo" -> "Japan, Tokyo"
    "France" -> "France, None"
    "Lisbon" -> "Portugal, Lisbon"
    "abcdef" -> "None, None"
    """

    completion = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": "You are a location standardization assistant. Respond only with the standardized location or None."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # keep the standardization as repeatable as possible
        response_format=LocationInformation
    )

    parsed = completion.choices[0].message.parsed
    # `parsed` is None when the model refuses to answer; report an unknown
    # location instead of failing with AttributeError.
    if parsed is None:
        return None, None

    return parsed.country, parsed.state_region_city

# Structured-output schema for a standardized company name.
# Passed as `response_format` in standardize_company, which returns the
# single `company` field.
# (Documented with comments rather than a class docstring so the schema
# generated from this model stays exactly as it was.)
class CompanyName(BaseModel):
    company: str | None  # standardized name, or None

def standardize_company(company_name: str, existing_companies: list[str]) -> str:        
    prompt = f"""
    Compare this company name: "{company_name}" with the following list of existing company names:
    {existing_companies}

    Rules:
    1. If the company name is similar to an existing one (e.g., "Google Inc" vs "Google" or "Meta" vs "Facebook"), return the existing format
    2. If it's a new company, return the most formal version of the name
    3. Ignore case when comparing
    4. If the input is not a valid company name, return None

    Return the standardized name.
    """
    
    completion = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": "You are a company name standardization assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        response_format=CompanyName
    )
    
    response_content = completion.choices[0].message.parsed
    return response_content.company

# Read the CSV file
# Fail with a clear message when no export was found, instead of letting
# pd.read_csv(None) raise an opaque error.
if latest_file is None:
    raise FileNotFoundError(
        f"No stargazer_info_*.csv file found in '{folder_name}'"
    )
df = pd.read_csv(latest_file)

# Create a new column for 'Job' in the dataframe
df['Job'] = ''

# A column that is entirely empty is read as float64; cast to object so the
# string values written below do not trigger pandas' incompatible-dtype
# warning.
for column in ('Location', 'Company'):
    df[column] = df[column].astype(object)

## Part 1 - Extract data from the bio
print("Part 1 - Extract data from the bio")
for index, row in df.iterrows():
    # Check that the bio is not empty, there might be important information there
    if pd.isna(row['Bio']):
        continue

    location, company, job_role = extract_data_from_bio(row['Bio'])

    # Only fill Location/Company when the user left the profile field empty;
    # an explicit profile value always wins over what the bio suggests.
    if pd.isna(row['Location']) and location:
        df.at[index, 'Location'] = location

    if pd.isna(row['Company']) and company:
        df.at[index, 'Company'] = company

    # The job role has no profile field, so the bio is its only source.
    if job_role:
        df.at[index, 'Job'] = job_role

## Part 2 - Standardize location into Country and State/City
print("Part 2 - Standardize location into Country and State/City")
df['Country'] = None
df['Region'] = None
for index, location in df['Location'].items():
    # Rows without a location stay None; no API call is spent on them.
    if pd.isna(location):
        continue
    country, region = standardize_location(location)
    df.at[index, 'Country'] = country
    df.at[index, 'Region'] = region

## Part 3 - Standardize company names
print("Part 3 - Standardize company names")
# Create the column up front so it exists even when nothing is standardized.
df['StandardCompany'] = None
known_companies = set()
for index, company_name in df['Company'].items():
    # NaN is truthy, so test for missing values explicitly.
    if pd.isna(company_name) or not str(company_name).strip():
        continue

    # Use this row's company (the previous code passed the `company` variable
    # left over from the Part 1 loop). The known names are sorted so the
    # prompt does not depend on set iteration order.
    standardized_company = standardize_company(company_name, sorted(known_companies))
    if standardized_company:
        df.at[index, 'StandardCompany'] = standardized_company
        known_companies.add(standardized_company)

# Save the updated dataframe
# Only the file extension is rewritten; str.replace would also alter any
# other ".csv" occurrence in the path.
output_root, output_ext = os.path.splitext(latest_file)
df.to_csv(f"{output_root}_processed{output_ext}", index=False)

print("Dataframe processing complete. Updated CSV saved.")