In [None]:
import json
import math
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sqlalchemy import create_engine, text

# from naivebayes import NaiveBayesTextClassifier

In [None]:
# Load variables from a local .env file into the process environment so the
# SNOWFLAKE_* settings read in the next cell are available.
load_dotenv()

In [None]:
# Snowflake connection settings, taken from the process environment.
# Each value is None when the corresponding variable is not set.
username = os.environ.get('SNOWFLAKE_USER')
password = os.environ.get('SNOWFLAKE_PASSWORD')
account = os.environ.get('SNOWFLAKE_ACCOUNT')
warehouse = os.environ.get('SNOWFLAKE_WAREHOUSE')
database = os.environ.get('SNOWFLAKE_DATABASE')
schema = os.environ.get('SNOWFLAKE_SCHEMA')

In [None]:
# Build the SQLAlchemy engine for Snowflake.
#
# The user name and password are percent-encoded before being placed in the
# URL: characters such as '@', ':', '/' or '%' in a raw credential would
# otherwise be read as URL delimiters and corrupt the connection string.
# The environment is expected to hold the raw (un-encoded) values.
# str() keeps the previous rendering ('None') when a variable is unset.
encoded_username = quote(str(username), safe='')
encoded_password = quote(str(password), safe='')

engine = create_engine(
    f'snowflake://{encoded_username}:{encoded_password}'
    f'@{account}/{database}/{schema}?warehouse={warehouse}'
)

In [None]:
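# NOTE: disabled alternative that trains through the train_and_predict_classifier
# table function. It is the only statement in this notebook that creates
# MODEL_PARAMETERS, the table that is queried further down.
# query = [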
# """
# CREATE OR REPLACE TABLE MODEL_PARAMETERS AS
# SELECT results.*
# FROM (
#     SELECT 'TRAIN' AS mode,
#            parse_json(u.COL1):label::string AS label,
#            parse_json(u.COL1):text::string AS text
#     FROM TRAINING AS u
#     WHERE parse_json(u.COL1):label::string IN ('0', '1') -- Filter labels
#     LIMIT 1000
# ) AS data,
#      TABLE(train_and_predict_classifier(data.mode, data.label, data.text) OVER ()) AS results
# """
# ]

In [None]:
# Ordered Snowflake statements that build a Naive Bayes model table by table.
# Every statement is CREATE OR REPLACE, so the list can be re-run as a whole;
# each statement reads tables produced by earlier ones, so order matters.
queries = [
    # 1. TRAINING_RAW: parse the JSON in TRAINING.COL1 into LABEL / TEXT and
    #    keep only rows labelled 0 or 1.
    """
    CREATE OR REPLACE TABLE TRAINING_RAW AS
    SELECT
      CAST(PARSE_JSON(COL1):"label" AS INT) AS LABEL,
      CAST(PARSE_JSON(COL1):"text" AS VARCHAR) AS TEXT
    FROM TRAINING
    WHERE CAST(PARSE_JSON(COL1):"label" AS INT) IN (0, 1)
    """,
    # 2. TOKENIZED_DATA: one row per space-separated token for 1000 rows.
    #    NOTE: LIMIT has no ORDER BY, so which 1000 rows are used is not pinned.
    """
    CREATE OR REPLACE TABLE TOKENIZED_DATA AS
    SELECT
      t.LABEL,
      t2.VALUE AS TOKEN
    FROM (
      SELECT LABEL, TEXT
      FROM TRAINING_RAW
      LIMIT 1000
    ) t,
    LATERAL SPLIT_TO_TABLE(t.TEXT, ' ') t2
    """,
    # 3. CLEAN_TOKENS: strip non-alphanumeric characters and lower-case.
    #    The filter is applied to the *cleaned* value: filtering on the raw
    #    TOKEN column let punctuation-only tokens (e.g. '!!!') through as an
    #    empty-string feature.
    """
    CREATE OR REPLACE TABLE CLEAN_TOKENS AS
    SELECT
      LABEL,
      LOWER(REGEXP_REPLACE(TOKEN, '[^a-zA-Z0-9]', '')) AS TOKEN
    FROM TOKENIZED_DATA
    WHERE REGEXP_REPLACE(TOKEN, '[^a-zA-Z0-9]', '') <> ''
    """,
    # 4. LABEL_COUNTS: number of tokens (not documents) per label.
    """
    CREATE OR REPLACE TABLE LABEL_COUNTS AS
    SELECT
      LABEL,
      COUNT(*) AS LABEL_COUNT
    FROM CLEAN_TOKENS
    GROUP BY LABEL
    """,
    # 5. FEATURE_LABEL_COUNTS: occurrences of each token within each label.
    """
    CREATE OR REPLACE TABLE FEATURE_LABEL_COUNTS AS
    SELECT
      TOKEN AS FEATURE,
      LABEL,
      COUNT(*) AS FEATURE_LABEL_COUNT
    FROM CLEAN_TOKENS
    GROUP BY TOKEN, LABEL
    """,
    # 6. TOTAL_COUNT: total number of tokens across all labels.
    """
    CREATE OR REPLACE TABLE TOTAL_COUNT AS
    SELECT SUM(LABEL_COUNT) AS TOTAL
    FROM LABEL_COUNTS
    """,
    # 7. LABEL_PROBABILITIES: share of all tokens that belong to each label.
    """
    CREATE OR REPLACE TABLE LABEL_PROBABILITIES AS
    SELECT
      LABEL,
      (LABEL_COUNT * 1.0) / (SELECT TOTAL FROM TOTAL_COUNT) AS P_LABEL
    FROM LABEL_COUNTS
    """,
    # 8. FEATURE_PROBABILITIES: token count divided by the label's token count
    #    (no smoothing is applied).
    """
    CREATE OR REPLACE TABLE FEATURE_PROBABILITIES AS
    SELECT
        FL.FEATURE,
        FL.LABEL,
        (FL.FEATURE_LABEL_COUNT * 1.0) / LC.LABEL_COUNT AS P_FEATURE_LABEL
    FROM FEATURE_LABEL_COUNTS FL
    JOIN LABEL_COUNTS LC ON FL.LABEL = LC.LABEL
    """,
    # 9. NB_MODEL: final table with every column stored as VARCHAR.
    """
    CREATE OR REPLACE TABLE NB_MODEL (
        FEATURE VARCHAR(16777216),
        LABEL VARCHAR(16777216),
        PROBABILITY VARCHAR(16777216)
    ) AS
    SELECT
        FEATURE,
        LABEL,
        TO_VARCHAR(P_FEATURE_LABEL) AS PROBABILITY
    FROM FEATURE_PROBABILITIES
    """
]

In [None]:
# Benchmark: run the full `queries` pipeline repeatedly on one connection,
# recording the duration of every statement and of every complete pass.
n_iterations = 100  # number of complete passes over `queries`

per_query_times = {f"query_{i+1}": [] for i in range(len(queries))}
total_execution_times = []

try:
    with engine.connect() as connection:
        for iteration in range(1, n_iterations + 1):
            # perf_counter() is monotonic and high-resolution, so measured
            # intervals are not affected by system clock adjustments.
            start_time = time.perf_counter()
            for i, q in enumerate(queries, start=1):
                q_start = time.perf_counter()
                connection.execute(text(q))
                per_query_times[f"query_{i}"].append(time.perf_counter() - q_start)
            elapsed = time.perf_counter() - start_time
            print(f"Iteration {iteration}: Queries executed in {elapsed:.4f} seconds.")
            total_execution_times.append(elapsed)

except Exception as e:
    # Best effort: report the failure and still persist the timings gathered
    # so far. After a failure the per-query lists may have unequal lengths.
    print(f"Error executing query: {e}")

# Create the JSON structure (per-query timings only; the per-pass totals stay
# in `total_execution_times`).
output_data = dict(per_query_times)

# Save to JSON
output_file = "per_query_times.json"
with open(output_file, 'w') as f:
    json.dump(output_data, f, indent=4)

print(f"Per-query times have been saved to {output_file}")


In [None]:
# Write one total pass duration per line (seconds, four decimals).
file_path = "execution_times.txt"

try:
    with open(file_path, "w") as file:
        # The loop variable must not be called `time`: that would rebind the
        # imported `time` module and break the benchmark cell on a re-run.
        for elapsed in total_execution_times:
            file.write(f"{elapsed:.4f}\n")
    print(f"Execution times saved to {file_path}")
except OSError as e:
    # Only file-system failures are reported here; other errors propagate.
    print(f"Error saving execution times to file: {e}")


In [None]:
# Statement used to read back the stored model parameters.
# NOTE(review): within this notebook MODEL_PARAMETERS is only created by the
# commented-out UDTF query; the active `queries` pipeline writes NB_MODEL
# instead — confirm which table is intended here.
query = "SELECT * FROM MODEL_PARAMETERS"

In [None]:
# Run `query` on a fresh connection and pull every result row into memory;
# the connection is closed when the `with` block exits.
with engine.connect() as connection:
    result = connection.execute(text(query))
    rows = result.fetchall()

In [None]:
# Wrap the fetched rows in a DataFrame. This frame is later passed to
# predict(), which reads its 'feature', 'label' and 'probability' columns.
# NOTE(review): the column names depend on how pandas interprets the row
# objects — verify they come out as expected.
trained = pd.DataFrame(rows)

### Transform the test data

In [None]:
# Load the local training split; the path is relative to the working directory.
train = pd.read_parquet('yelp_review_full/yelp_review_full/train-00000-of-00001.parquet')

In [None]:
# Keep the first 1000 reviews labelled 0 or 1. The label filter is applied
# before slicing so this matches the SQL pipeline (label filter, then
# LIMIT 1000) and the test-set cells (filter, then slice); slicing first left
# only the 0/1 rows that happened to be among the first 1000 of all labels.
train = train[train['label'].isin([0, 1])][:1000]

In [None]:
# Restrict the training frame to reviews whose label is 0 or 1.
train = train.loc[train['label'].isin([0, 1])]

In [None]:
# Load the local test split; the path is relative to the working directory.
test = pd.read_parquet('yelp_review_full/yelp_review_full/test-00000-of-00001.parquet')

In [None]:
# Restrict the test frame to reviews whose label is 0 or 1.
test = test.loc[test['label'].isin([0, 1])]

In [None]:
# Keep the first 100 rows (by position) of the filtered test frame.
test = test.iloc[:100]

In [None]:
# Bag-of-words encoding. The vocabulary (and therefore the column order) is
# learned from the local `train` frame only; `test` is encoded with that same
# vocabulary so both matrices share their columns.
# NOTE(review): predict() looks features up by column index unless told
# otherwise — confirm the stored model uses the same feature indexing.
vectorizer = CountVectorizer()
training_data = vectorizer.fit_transform(train['text'])
test_data = vectorizer.transform(test['text'])

#### Predict

In [None]:
def predict(X_test, df_train, feature_names=None):
    """Classify each row of a word-count matrix with a stored Naive Bayes model.

    Parameters
    ----------
    X_test : sparse matrix
        One row per document, one column per feature; must provide
        ``toarray()``. Entries are word counts.
    df_train : pandas.DataFrame
        Model parameters with columns ``'feature'``, ``'label'`` and
        ``'probability'``. Rows whose feature is ``'__PRIOR__'`` carry the
        class priors for labels ``'1'`` and ``'0'``; all other rows carry the
        probability of a feature given a label. Probabilities may be numbers
        or numeric strings.
    feature_names : sequence, optional
        Maps column position to the key used in ``df_train['feature']``
        (for example the vectorizer's vocabulary in column order). When
        omitted, the integer column position itself is used as the key.

    Returns
    -------
    list
        The predicted label for every row of ``X_test``. On a tie the label
        that appears first in ``df_train['label']`` wins.

    Raises
    ------
    IndexError
        If ``df_train`` has no ``'__PRIOR__'`` row for label ``'1'`` or ``'0'``.

    Notes
    -----
    Features that are unknown for a label, or whose probability is not
    positive, are skipped rather than penalised (no smoothing is applied).
    Any label other than ``'1'`` is scored with the prior of label ``'0'``.
    """
    is_prior = df_train['feature'] == '__PRIOR__'

    def _prior(label):
        # First matching prior row, converted from its stored representation.
        return float(df_train.loc[is_prior & (df_train['label'] == label), 'probability'].iloc[0])

    log_prior_1 = math.log(_prior('1'))
    log_prior_0 = math.log(_prior('0'))

    likelihood_rows = df_train[~is_prior]
    labels = df_train['label'].unique()

    # Per-label lookup: feature key -> probability. Values are converted to
    # float here, exactly as the priors are, so probabilities stored as text
    # can be compared and passed to math.log below.
    label_word_probs = {
        label: (
            likelihood_rows.loc[likelihood_rows['label'] == label]
            .set_index('feature')['probability']
            .astype(float)
            .to_dict()
        )
        for label in labels
    }

    keys = list(feature_names) if feature_names is not None else None

    predictions = []
    for test_vector in X_test.toarray():
        # The non-zero columns do not depend on the label: collect them once.
        present = [
            (column if keys is None else keys[column], word_count)
            for column, word_count in enumerate(test_vector)
            if word_count > 0
        ]

        scores = {}
        for label in labels:
            word_probs = label_word_probs[label]
            log_likelihood = 0
            for key, word_count in present:
                word_prob = word_probs.get(key, 0)
                if word_prob > 0:
                    log_likelihood += word_count * math.log(word_prob)

            log_prior = log_prior_1 if label == '1' else log_prior_0
            scores[label] = log_prior + log_likelihood

        predictions.append(max(scores, key=scores.get))

    return predictions


In [None]:
# Predicted label for every row of `test_data`, using the parameters in `trained`.
ls  = predict(test_data,trained)