In [None]:
import json
import math
import os
import time
import warnings
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sqlalchemy import create_engine, text

from naivebayes import NaiveBayesTextClassifier

In [None]:
# Load settings from a local .env file (if present) into the process
# environment so the SNOWFLAKE_* lookups in the next cell can find them.
load_dotenv()

In [None]:
# Snowflake connection settings, read from the environment in one pass.
# Any variable that is not set comes back as None.
(username, password, account,
 warehouse, database, schema) = map(os.environ.get, (
    'SNOWFLAKE_USER',
    'SNOWFLAKE_PASSWORD',
    'SNOWFLAKE_ACCOUNT',
    'SNOWFLAKE_WAREHOUSE',
    'SNOWFLAKE_DATABASE',
    'SNOWFLAKE_SCHEMA',
))

In [None]:
# Fail fast when a setting is missing: otherwise the f-string below would
# silently embed the text "None" in the URL and the error would only surface
# later, at connect time, with a much less helpful message.
_connection_settings = {
    'SNOWFLAKE_USER': username,
    'SNOWFLAKE_PASSWORD': password,
    'SNOWFLAKE_ACCOUNT': account,
    'SNOWFLAKE_WAREHOUSE': warehouse,
    'SNOWFLAKE_DATABASE': database,
    'SNOWFLAKE_SCHEMA': schema,
}
_missing_settings = [name for name, value in _connection_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        f"Missing Snowflake settings: {', '.join(_missing_settings)}"
    )

# Credentials are percent-encoded because characters such as '@', ':', '/'
# or '%' in a raw password would otherwise be parsed as URL delimiters.
# `quote` (not `quote_plus`) is used so that a space becomes %20 rather
# than '+'.
engine = create_engine(
    f'snowflake://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{account}/{database}/{schema}?warehouse={warehouse}'
)

In [None]:
# Training statement: (re)creates MODEL_PARAMETERS from the rows returned by
# the train_and_predict_classifier table function, fed with up to 1000
# TRAINING rows whose JSON `label` is '0' or '1'.
# Rows whose feature is '__PRIOR__' are filtered out, so the resulting table
# holds no class-prior rows.
# NOTE(review): LIMIT is applied without ORDER BY, so the query itself does
# not pin down which 1000 rows are used -- confirm this is acceptable.
query = """
CREATE OR REPLACE TABLE MODEL_PARAMETERS AS
SELECT results.*
FROM (
    SELECT 'TRAIN' AS mode,
           parse_json(u.COL1):label::string AS label,
           parse_json(u.COL1):text::string AS text
    FROM TRAINING AS u
    WHERE parse_json(u.COL1):label::string IN ('0', '1') -- Filter labels
    LIMIT 1000
) AS data,
     TABLE(train_and_predict_classifier(data.mode, data.label, data.text) OVER ()) AS results
     WHERE results.feature != '__PRIOR__'; -- Exclude prior probabilities
"""


In [None]:
# Benchmark: run the training statement repeatedly and record how long each
# execution (including fetching its result rows) takes.

# Number of timed executions of the training statement.
N_ITERATIONS = 100

execution_times = []

# Build the SQL construct once; it does not change between iterations.
statement = text(query)

try:
    with engine.connect() as connection:
        for i in range(N_ITERATIONS):
            # perf_counter() is a monotonic, high-resolution clock meant for
            # measuring intervals; time.time() can jump when the system
            # clock is adjusted.
            start_time = time.perf_counter()
            result = connection.execute(statement)
            rows = result.fetchall()
            end_time = time.perf_counter()
            execution_times.append(end_time - start_time)
            print(f"Iteration {i+1}: Query executed in {end_time - start_time:.4f} seconds.")
except Exception as e:
    # Best effort: keep the timings collected so far and still print a summary.
    print(f"Error executing query: {e}")

# All statistics fall back to 0 when no iteration completed.
average_time = sum(execution_times) / len(execution_times) if execution_times else 0
execution_summary = {
    "iterations": len(execution_times),
    "average_time": average_time,
    "total_time": sum(execution_times),
    "max_time": max(execution_times, default=0),
    "min_time": min(execution_times, default=0),
}

print("\nExecution Summary:")
print(f"Total Iterations: {execution_summary['iterations']}")
print(f"Average Time: {execution_summary['average_time']:.4f} seconds")
print(f"Total Time: {execution_summary['total_time']:.4f} seconds")
print(f"Max Time: {execution_summary['max_time']:.4f} seconds")
print(f"Min Time: {execution_summary['min_time']:.4f} seconds")

In [None]:
# Persist the measured durations, one per line, rounded to 4 decimals.
file_path = "execution_times.txt"

try:
    with open(file_path, "w") as file:
        # The loop variable must not be called `time`: at cell level that
        # would rebind the global `time` module to a float and break any
        # later (re-)run of the benchmark cell. A generator expression also
        # keeps the variable out of the notebook namespace entirely.
        file.writelines(f"{elapsed:.4f}\n" for elapsed in execution_times)
    print(f"Execution times saved to {file_path}")
except Exception as e:
    print(f"Error saving execution times to file: {e}")


In [None]:
# Read back every row of the parameter table built by the training statement.
# NOTE(review): this rebinds `query`, which previously held the training SQL;
# re-running the benchmark cell after this one would time this SELECT instead.
query = "SELECT * FROM MODEL_PARAMETERS"

In [None]:
# Run the current `query` on a fresh connection and materialise all result
# rows before the connection is closed at the end of the `with` block.
with engine.connect() as connection:
    result = connection.execute(text(query))
    rows = result.fetchall()

In [None]:
# Wrap the fetched rows in a DataFrame for use by `predict`, which reads the
# columns 'feature', 'label' and 'probability'.
# NOTE(review): assumes the rows carry those column names -- confirm with
# `trained.columns` after running.
trained = pd.DataFrame(rows)

### Load and vectorize the training and test data

In [None]:
# Load the local training split from parquet.
train = pd.read_parquet('yelp_review_full/yelp_review_full/train-00000-of-00001.parquet')

In [None]:
# Keep only the first 1000 rows of the training split.
# NOTE(review): this truncation happens BEFORE the label filter in the next
# cell, whereas the test split and the SQL training statement filter labels
# first and limit afterwards -- confirm the ordering is intended.
train = train[:1000]

In [None]:
# Restrict the (already truncated) training rows to labels 0 and 1, so fewer
# than 1000 rows may remain. Labels are integers here, while the SQL side
# compares them as the strings '0' and '1'.
train = train[train['label'].isin([0,1])]

In [None]:
# Load the local test split from parquet.
test = pd.read_parquet('yelp_review_full/yelp_review_full/test-00000-of-00001.parquet')

In [None]:
# Restrict the test rows to labels 0 and 1 (integer labels).
test = test[test['label'].isin([0,1])]

In [None]:
# Keep the first 100 of the label-filtered test rows.
test = test[:100]

In [None]:
# Learn the vocabulary from the local training texts and turn both splits
# into count matrices that share the same columns.
# NOTE(review): `predict` looks up probabilities by the column index of
# `test_data`; whether those indices match the `feature` values stored in
# MODEL_PARAMETERS cannot be determined from this notebook -- confirm.
vectorizer = CountVectorizer()
training_data = vectorizer.fit_transform(train['text'])
test_data = vectorizer.transform(test['text'])

#### Predict

In [None]:
def predict(X_test, df_train):
    """Classify each row of ``X_test`` using a Naive Bayes parameter table.

    Parameters
    ----------
    X_test : object exposing ``toarray()``
        Count matrix with one row per document and one column per feature.
    df_train : pandas.DataFrame
        Must provide ``feature``, ``label`` and ``probability`` columns.
        Rows whose ``feature`` is ``'__PRIOR__'`` hold the class prior of
        their label; every other row holds the probability of one feature
        under one label.

    Returns
    -------
    list
        The highest-scoring label for every row of ``X_test``, in row order.
        Ties go to the label that appears first in ``df_train``.

    Raises
    ------
    ValueError
        If ``df_train`` contains no labels.

    Notes
    -----
    * Probabilities are looked up by the *column index* of ``X_test``.
      NOTE(review): if the ``feature`` column stores something other than
      those indices (e.g. the tokens themselves), no lookup will hit and only
      the priors decide the result -- confirm against the table contents.
    * Features without a stored, strictly positive probability are skipped.
    * If any label has no ``'__PRIOR__'`` row, uniform priors are used for
      all labels and a warning is emitted instead of failing.
    """
    is_prior = df_train['feature'] == '__PRIOR__'
    labels = df_train['label'].unique()
    if len(labels) == 0:
        raise ValueError("df_train contains no labels")

    # One log-prior per label; the first prior row of a label wins.
    prior_rows = df_train[is_prior]
    log_priors = {}
    for label, probability in zip(prior_rows['label'], prior_rows['probability']):
        log_priors.setdefault(label, math.log(float(probability)))

    missing_priors = [label for label in labels if label not in log_priors]
    if missing_priors:
        warnings.warn(
            f"No '__PRIOR__' row for label(s) {missing_priors}; "
            "falling back to uniform class priors.",
            stacklevel=2,
        )
        uniform_log_prior = math.log(1.0 / len(labels))
        log_priors = {label: uniform_log_prior for label in labels}

    # Per label: feature key -> probability.
    feature_rows = df_train[~is_prior]
    label_word_probs = {
        label: feature_rows[feature_rows['label'] == label]
        .set_index('feature')['probability']
        .to_dict()
        for label in labels
    }

    predictions = []
    for test_vector in X_test.toarray():
        # The non-zero entries are the same for every label: find them once.
        present = [(i, count) for i, count in enumerate(test_vector) if count > 0]

        results = {}
        for label in labels:
            word_probs = label_word_probs[label]
            log_likelihood = 0
            for i, word_count in present:
                # Cast like the priors: the stored value may not be a float.
                word_prob = float(word_probs.get(i, 0))
                if word_prob > 0:
                    log_likelihood += word_count * math.log(word_prob)
            results[label] = log_priors[label] + log_likelihood

        predictions.append(max(results, key=results.get))

    return predictions


In [None]:
# Predict a label for every test document from the parameters read back
# from MODEL_PARAMETERS (a table that was created without '__PRIOR__' rows).
# NOTE(review): the name `ls` hides IPython's `ls` shortcut while it is
# defined -- consider a more descriptive name.
ls  = predict(test_data,trained)