In [5]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

%load_ext dotenv
%dotenv
BASE_DIR = '../SQL/'


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [6]:
def etl_query(filename: str) -> pd.DataFrame:
    """Run the SQL script ``<BASE_DIR>/<filename>.sql`` on Postgres and return the rows.

    Connection settings are taken from the environment variables ``host``,
    ``database``, ``user``, ``password`` and ``port``.

    Parameters
    ----------
    filename : str
        Base name of the SQL file inside ``BASE_DIR``, without the ``.sql``
        extension.

    Returns
    -------
    pd.DataFrame
        The result set produced by the query.

    Raises
    ------
    KeyError
        If one of the connection environment variables is not set.
    ValueError
        If ``port`` is set but is not an integer.
    OSError
        If the SQL file cannot be opened (e.g. ``FileNotFoundError``).
    Exception
        Database/driver errors raised while connecting or executing the query
        propagate to the caller instead of being printed and swallowed, so a
        failed load stops at this cell rather than handing back ``None``.
    """
    port = os.environ['port']

    # Build the URL from its components rather than by string formatting:
    # URL.create escapes characters such as '@', '/' or ':' in the user name or
    # password, which would otherwise corrupt the connection string (and could
    # leak the password through the resulting parse-error message).
    db_url = URL.create(
        drivername='postgresql',
        username=os.environ['user'],
        password=os.environ['password'],
        host=os.environ['host'],
        port=int(port) if port else None,
        database=os.environ['database'],
    )

    # os.path.join works whether or not BASE_DIR ends with a path separator.
    sql_path = os.path.join(BASE_DIR, filename + '.sql')
    with open(sql_path, 'r') as file:
        sql_query = file.read()

    engine = create_engine(db_url)
    try:
        return pd.read_sql(sql_query, engine)
    finally:
        # A fresh Engine (with its own connection pool) is created per call;
        # dispose it so repeated calls do not accumulate open connections.
        engine.dispose()

In [8]:
# Base names of the SQL scripts to run; etl_query looks each one up under
# BASE_DIR and appends the '.sql' extension itself.
merge_sql = 'merged_data'
equity_value_data = 'equity_value_data'
# NOTE(review): presumably query variants drafted with different ChatGPT
# models, judging by the file names — confirm with the author.
chatgpt_sql_4o = 'chatGPT-model-4o'
chatgpt_sql_o1 = 'chatGPT-model-o1'
chatgpt_sql_o1_q1='chatGPT-model-o1-Q1'
# Execute the 'equity_value_data' script and bind its result to df.
df = etl_query(equity_value_data)
# Bare last expression so the notebook renders the frame (pandas shows a
# truncated head/tail view for long frames).
df

Unnamed: 0,timestamp,close_equity,user_id
0,2016-11-16,48.16,bcef4fa9b0bdf22bcf7deae708decf03
1,2016-11-17,48.16,bcef4fa9b0bdf22bcf7deae708decf03
2,2016-11-18,48.16,bcef4fa9b0bdf22bcf7deae708decf03
3,2016-11-21,48.16,bcef4fa9b0bdf22bcf7deae708decf03
4,2016-11-22,48.16,bcef4fa9b0bdf22bcf7deae708decf03
...,...,...,...
1119153,2017-08-14,2270.71,98580360c4fb5b0ec511cd87f0d429ed
1119154,2017-08-15,2275.02,98580360c4fb5b0ec511cd87f0d429ed
1119155,2017-08-16,2282.03,98580360c4fb5b0ec511cd87f0d429ed
1119156,2017-08-17,2237.45,98580360c4fb5b0ec511cd87f0d429ed
