In [11]:
import os
import openai
import psycopg2
import pandas as pd
import pretty_errors
import timeit
import logging
from dotenv import load_dotenv


In [12]:
# Load key=value pairs from the local `.env` file into the process environment.
load_dotenv('.env')
# OpenAI credential, taken from the environment instead of being hard-coded.
openai.api_key = os.getenv("OPENAI_API_KEY")
# PostgreSQL connection settings; these are passed to psycopg2.connect() in
# find_jobs_per_ids().
user = os.getenv("user")
password = os.getenv("password")
host = os.getenv("host")
port = os.getenv("port")
database = os.getenv("database")
# Directory prefix for the parquet files; it is concatenated with "/<name>.parquet".
# NOTE(review): os.getenv() returns None when the variable is missing, in which case
# those string concatenations raise TypeError — confirm `.env` defines SAVE_PATH.
SAVE_PATH = os.getenv("SAVE_PATH")

In [13]:
# Log file location. It can be overridden with the LOG_FILE_PATH environment variable
# (e.g. from `.env`) so the notebook is not tied to a single machine; when the variable
# is unset, the original absolute path is used unchanged.
LOG_FILE_PATH = os.getenv(
    "LOG_FILE_PATH",
    '/Users/juanreyesgarcia/Library/CloudStorage/OneDrive-FundacionUniversidaddelasAmericasPuebla/DEVELOPER/PROJECTS/DreamedJobAI/logs/LoggingTEST.log',
)
# INFO-level file logging with a "timestamp - level - message" line format.
logging.basicConfig(filename=LOG_FILE_PATH, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [14]:
def set_dataframe_display_options():
    """Configure pandas so DataFrames print in full, without truncation or wrapping."""
    # Options whose limit is removed by setting them to None: number of columns,
    # number of rows, total output width and per-cell text width.
    for unlimited_option in (
        'display.max_columns',
        'display.max_rows',
        'display.width',
        'display.max_colwidth',
    ):
        pd.set_option(unlimited_option, None)
    # Do not wrap a wide frame's representation across multiple lines.
    pd.set_option('display.expand_frame_repr', False)


# Apply the display settings once so every later cell prints full frames.
set_dataframe_display_options()

In [20]:

# Previously saved result frames, read from machine-specific absolute paths.
# NOTE(review): a later cell writes SAVE_PATH + "/final_user_df.parquet"; presumably
# that is the same file as the first read below, which means this cell depends on
# output from an earlier session — confirm, and prefer building the path from SAVE_PATH.
df_final_user = pd.read_parquet('/Users/juanreyesgarcia/Library/CloudStorage/OneDrive-FundacionUniversidaddelasAmericasPuebla/DEVELOPER/PROJECTS/DreamedJobAI/data/final_user_df.parquet')
df_most_suitable = pd.read_parquet('/Users/juanreyesgarcia/Library/CloudStorage/OneDrive-FundacionUniversidaddelasAmericasPuebla/DEVELOPER/PROJECTS/DreamedJobAI/data/df_most_suitable.parquet')
# Summaries and the e5_base_v2 data, both read relative to SAVE_PATH.
df_summaries = pd.read_parquet(SAVE_PATH + "/summaries.parquet")
df_e5_data = pd.read_parquet(SAVE_PATH + "/e5_base_v2_data.parquet")


In [21]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# bare; wrapping it in print() only adds a stray "None" line after the report.
df_e5_data.info()
# Plain-text dump of the whole frame.
print(df_e5_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         211 non-null    int64         
 1   original   211 non-null    object        
 2   embedding  211 non-null    object        
 3   timestamp  211 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 6.7+ KB
None
         id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [17]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# bare; wrapping it in print() only adds a stray "None" line after the report.
df_most_suitable.info()
# Plain-text dump of the whole frame.
print(df_most_suitable)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           14 non-null     int64 
 1   suitability  14 non-null     object
 2   explanation  14 non-null     object
dtypes: int64(1), object(2)
memory usage: 464.0+ bytes
None
        id           suitability                                                                                                                                                                                                                                 explanation
0   233648   Moderately Suitable    The candidate has experience in data analysis and is proficient in Python and SQL Server, which aligns with the job's requirements. However, they lack experience with GCP/AWS data infrastructure products and blockchain technologies.
1   236782   Moderately Suitable  The candidate has experience in data analysis and is proficie

In [18]:
#Get the ids
def ids_df_most_suitable(df: "pd.DataFrame | None" = None) -> str:
    """Render the ``id`` column of a frame as a parenthesised, quoted SQL list.

    Args:
        df: Frame holding an ``id`` column. When omitted, the module-level
            ``df_most_suitable`` is used; it is looked up at call time so that
            re-loading that frame is picked up without re-defining this function.

    Returns:
        A string such as ``"('233648', '236782')"``, suitable for an SQL
        ``IN`` clause. ``"()"`` is returned when the frame has no rows or no
        ``id`` column.
    """
    if df is None:
        df = df_most_suitable

    if "id" not in df.columns:
        return "()"

    # Read the column directly instead of going through iterrows(): iterrows()
    # builds one Series per row and does not preserve dtypes, so an integer id
    # sitting next to float columns would be rendered as '233648.0'.
    quoted_ids = ", ".join(f"'{job_id}'" for job_id in df["id"])
    return f"({quoted_ids})"

# Called without arguments, so the ids are taken from df_most_suitable; the result is a
# single string holding a parenthesised SQL list.
ids_most_suitable = ids_df_most_suitable()
# Record both the type and the rendered id list for debugging.
logging.info(f"Getting the ids from the json object: {type(ids_most_suitable)}, {ids_most_suitable}")

def find_jobs_per_ids(ids:str, table: str = "main_jobs") -> pd.DataFrame:
    """Fetch id, title, link and location for the given job ids from PostgreSQL.

    Args:
        ids: Parenthesised, comma-separated SQL list of ids, e.g. ``"('1', '2')"``.
        table: Table to query; should be either "main_jobs" or "test".

    Returns:
        DataFrame with the columns 'id', 'title', 'link' and 'location', one row
        per job returned by the query. When ``ids`` contains no ids, an empty
        frame with the same columns is returned and the database is not contacted.

    Raises:
        psycopg2.Error: if connecting to the database or running the query fails.
    """
    columns = ["id", "title", "link", "location"]

    # "WHERE id IN ()" is a syntax error in PostgreSQL, so an empty id list is
    # answered locally with an empty result instead of a failing query.
    if ids.strip() in ("", "()"):
        return pd.DataFrame({name: [] for name in columns})

    conn = psycopg2.connect(user=user, password=password, host=host, port=port, database=database)
    try:
        # The cursor context manager closes the cursor when the block exits.
        with conn.cursor() as cur:
            # NOTE(review): `table` and `ids` are interpolated straight into the SQL
            # text. That is only safe while both come from trusted code in this
            # notebook; switch to parameter binding before accepting external input.
            cur.execute( f"SELECT id, title, link, location FROM {table} WHERE id IN {ids}")
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    # One list per selected column, in SELECT order.
    return pd.DataFrame({
        name: [row[position] for row in rows]
        for position, name in enumerate(columns)
    })

# Fetch id, title, link and location from PostgreSQL for the selected ids.
df_postgre = find_jobs_per_ids(ids=ids_most_suitable)

# Read the parquet with ids & summaries.
# NOTE(review): the same file is already loaded into df_summaries by an earlier cell,
# so this read looks redundant — confirm before removing.
df_summaries = pd.read_parquet(SAVE_PATH + "/summaries.parquet")
# Inner-merge the PostgreSQL rows with the summaries on 'id'; ids missing from either
# side are dropped.
df_postgre_summaries = df_postgre.merge(df_summaries, on='id', how='inner')
# Inner-merge with df_most_suitable on 'id' so each row also carries its suitability
# and explanation.
df = df_postgre_summaries.merge(df_most_suitable, on="id", how='inner')

# Log the full merged frame.
logging.info(f"ALL ROWS: \n {df}")


def sort_df_by_suitability(df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Return a copy of ``df`` ordered from most to least suitable.

    Args:
        df: Frame with a 'suitability' column. When omitted, the module-level
            ``df`` is used; it is looked up at call time so that rebuilding that
            frame is picked up without re-defining this function.

    Returns:
        A new DataFrame with the same columns as the input, sorted by
        'Highly Suitable' > 'Moderately Suitable' > 'Potentially Suitable' >
        'Marginally Suitable' > 'Not Suitable'. Rows whose suitability is not
        one of those labels are placed last. Rows with equal rank keep their
        input order. The input frame is not modified.
    """
    if df is None:
        # The parameter shadows the module-level `df`, so fetch it explicitly.
        df = globals()["df"]

    custom_order = {
        'Highly Suitable': 1,
        'Moderately Suitable': 2,
        'Potentially Suitable': 3,
        'Marginally Suitable': 4,
        'Not Suitable': 5
    }
    # assign() works on a copy, so the helper column never leaks into the
    # caller's frame; unmapped labels become NaN and sort to the end.
    ranked = df.assign(suitability_rank=df['suitability'].map(custom_order))
    # A stable sort keeps the incoming order among rows sharing a rank.
    sorted_df = ranked.sort_values(by='suitability_rank', kind='stable')
    return sorted_df.drop(columns='suitability_rank')

# Called without arguments, so the function sorts its default frame.
sorted_df = sort_df_by_suitability()

# File stem; the leading "/" acts as the path separator in the SAVE_PATH concatenation
# below and also appears in the log message.
filename = "/final_user_df"

# Persist the sorted frame as parquet, without writing the index.
sorted_df.to_parquet(SAVE_PATH + f"{filename}.parquet", index=False)

# Log the full sorted frame together with the file it was written to.
logging.info(f"SORTED DF:\n {sorted_df}. \n This df has been saved in ...{filename}.parquet")

In [19]:
# Plain-text dump of df_final_user, the frame read from final_user_df.parquet in the
# loading cell.
print(df_final_user)

        id                                     title                                                                                      link                    location                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              