In [1]:
import numpy as np
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
import pandas as pd
import json
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import time
import math

In [2]:
load_dotenv()

True

In [3]:
username = os.getenv('SNOWFLAKE_USER')
password = os.getenv('SNOWFLAKE_PASSWORD')
account = os.getenv('SNOWFLAKE_ACCOUNT')
warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
database = os.getenv('SNOWFLAKE_DATABASE')
schema = os.getenv('SNOWFLAKE_SCHEMA')

In [4]:
engine = create_engine(
    f'snowflake://{username}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'
)

In [5]:
query_Python = "SELECT * FROM MODEL_PARAMETERS"
with engine.connect() as connection:
    result = connection.execute(text(query_Python))
    rows = result.fetchall()
trained_Python = pd.DataFrame(rows)

In [6]:
query_SQL = "SELECT * FROM NB_MODEL"
with engine.connect() as connection:
    result = connection.execute(text(query_SQL))
    rows = result.fetchall()
trained_SQL = pd.DataFrame(rows)

### Transform the test data

In [7]:
train = pd.read_parquet('../yelp_review_full/yelp_review_full/train-00000-of-00001.parquet')

In [8]:
train = train[:1000]

In [9]:
train = train[train['label'].isin([0,1])]

In [10]:
test = pd.read_parquet('../yelp_review_full/yelp_review_full/test-00000-of-00001.parquet')

In [11]:
test = test[test['label'].isin([0,1])]

In [12]:
test = test[:100]

In [13]:
y_true = list(test['label'].astype(str))

In [14]:
vectorizer = CountVectorizer()
training_data = vectorizer.fit_transform(train['text'])
test_data = vectorizer.transform(test['text'])

#### Predict

In [15]:
def predict(X_test, df_train):
    prior_1 = float(df_train.loc[(df_train['feature'] == '__PRIOR__') & (df_train['label'] == '1'), 'probability'].iloc[0])
    prior_0 = float(df_train.loc[(df_train['feature'] == '__PRIOR__') & (df_train['label'] == '0'), 'probability'].iloc[0])

    df_train_filtered = df_train[df_train['feature'] != '__PRIOR__'].copy()

    label_word_probs = {}
    labels = df_train['label'].unique()
    for label in labels:
        label_word_probs[label] = df_train_filtered[df_train_filtered['label'] == label].set_index('feature')['probability'].to_dict()

    feature_names = df_train_filtered['feature'].unique()

    log_prior_1 = math.log(prior_1)
    log_prior_0 = math.log(prior_0)

    predictions = []

    for test_vector in X_test.toarray():
        results = {}
        for label in labels:
            class_probabilities = 0
            word_probs = label_word_probs[label]
            for i, word_count in enumerate(test_vector):
                if word_count > 0:
                    word_name = feature_names[i]
                    word_prob = word_probs.get(word_name, 1e-10)  # Use a small value for unseen words
                    if float(word_prob) > 0:
                        class_probabilities += word_count * math.log(float(word_prob))

            if label == '1':
                results[label] = log_prior_1 + class_probabilities
            else:
                results[label] = log_prior_0 + class_probabilities

        predictions.append(max(results, key=results.get))

    return predictions


In [16]:
predict_Python  = predict(test_data,trained_Python)

In [17]:
print(classification_report(y_true,predict_Python))

              precision    recall  f1-score   support

           0       0.28      0.27      0.27        45
           1       0.42      0.44      0.43        55

    accuracy                           0.36       100
   macro avg       0.35      0.35      0.35       100
weighted avg       0.36      0.36      0.36       100



In [18]:
predict_SQL = predict(test_data,trained_Python)

In [21]:
print(classification_report(y_true,predict_SQL))

              precision    recall  f1-score   support

           0       0.28      0.27      0.27        45
           1       0.42      0.44      0.43        55

    accuracy                           0.36       100
   macro avg       0.35      0.35      0.35       100
weighted avg       0.36      0.36      0.36       100

