# Baseline Model (Logistic Regression)

---

**Author:** Diego Antonio Garc√≠a Padilla

**Date:** Nov 3, 2025

## Enviroment setup

In [6]:
#@title Setup & Environment Verification

import warnings
warnings.filterwarnings('ignore')

import os
import sys

print("=== ENVIRONMENT CHECK ===")
print(f"Python: {sys.version.split()[0]}")
print(f"JAVA_HOME: {os.environ.get('JAVA_HOME')}")
print(f"SPARK_HOME: {os.environ.get('SPARK_HOME')}")
print(f"Driver Memory: {os.environ.get('SPARK_DRIVER_MEMORY')}")
print(f"Executor Memory: {os.environ.get('SPARK_EXECUTOR_MEMORY')}")
print("=" * 50)

=== ENVIRONMENT CHECK ===
Python: 3.10.12
JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64/jre
SPARK_HOME: /opt/spark
Driver Memory: 12g
Executor Memory: 8g


In [7]:
#@title Import Libraries

# PySpark
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF, StringIndexer
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# SciKit Learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# TensorFlow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Data manipulation
import pandas as pd
import numpy as np

# Financial data
import yfinance as yf

# Hugging Face
from huggingface_hub import hf_hub_download

# Kaggle
import kagglehub

# Utilities
from datetime import datetime, timedelta
import json
import requests
import logging
from tqdm import tqdm
import time
import subprocess
from pathlib import Path

In [8]:
#@title Start Spark session

print("=== PRE-FLIGHT CHECK ===")

# Verify Java is available
try:
    java_version = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT)
    print("Java: ‚úÖ Available")
except Exception as e:
    print(f"Java: ‚ùå Not available - {e}")

print("=" * 50)

# üî• STOP any existing Spark sessions first
try:
    SparkContext.getOrCreate().stop()
    print("üßπ Cleaned up existing Spark session")
except:
    print("üÜï No existing session to clean")

print("=" * 50)

# Create fresh Spark session
spark = SparkSession.builder \
    .appName("Yelp_Sentiment_Analysis") \
    .master("local[*]") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "16") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

print("‚úÖ Spark session configured with:")
print(f"   - Driver Memory: 12GB")
print(f"   - Executor Memory: 8GB")
print(f"   - Max Result Size: 4GB")
print(f"   - Parallelism: 16 cores")
print(f"   - Shuffle Partitions: 200")

=== PRE-FLIGHT CHECK ===
Java: ‚úÖ Available
üßπ Cleaned up existing Spark session
‚úÖ Spark session configured with:
   - Driver Memory: 12GB
   - Executor Memory: 8GB
   - Max Result Size: 4GB
   - Parallelism: 16 cores
   - Shuffle Partitions: 200


## Feature Engineering

In [9]:
#@title Load dataset

# Parquet path
parquet_path = "../data/clean/yelp_reviews_tokenized.parquet"

yelp_df = spark.read.parquet(parquet_path)

print("\nüìä Creating train/validation/test splits...")

# Split: 70% train, 15% validation, 15% test
train_df, temp_df = yelp_df.randomSplit([0.7, 0.3], seed=42)
val_df, test_df = yelp_df.randomSplit([0.5, 0.5], seed=42)

# Show schema to understand structure
print("\nüìã Schema of Yelp Reviews:")
yelp_df.printSchema()

# Samplo
print("\nüìã Sample:")
yelp_df.show(5, truncate=80)


üìä Creating train/validation/test splits...

üìã Schema of Yelp Reviews:
root
 |-- text: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- text_length: integer (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- text_clean: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)


üìã Sample:
+--------------------------------------------------------------------------------+---------+-----------+----------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|                                                                            text|sentiment|text_length|word_count|                                            

## Logistic Regression with MLlib (baseline)

In [10]:
#@title Logistic Regression with TF-IDF (MLlib)

print("=" * 80)
print("üéØ BASELINE MODEL: Logistic Regression with TF-IDF")
print("=" * 80)

# Step 1: Convert sentiment labels to numerical indices
# 0 -> Negative
# 1 -> Neutral
# 2 -> Positive
label_indexer = StringIndexer(inputCol="sentiment", outputCol="label")

# Step 2: TF-IDF feature extraction
# HashingTF: converts tokens to term frequency vectors
hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="raw_features", numFeatures=10000)

# IDF: applies inverse document frequency weighting
idf = IDF(inputCol="raw_features", outputCol="features")

# Step 3: Logistic Regression classifier
lr = LogisticRegression(
    maxIter=20,
    regParam=0.01,  # L2 regularization
    elasticNetParam=0.0  # Pure L2 (ridge)
)

# Create pipeline
baseline_pipeline = Pipeline(stages=[label_indexer, hashingTF, idf, lr])

# Train model
print("\n‚è≥ Training baseline model...")
baseline_model = baseline_pipeline.fit(train_df)

# Make predictions
print("\nüìä Making predictions on validation set...")
val_predictions = baseline_model.transform(val_df)
test_predictions = baseline_model.transform(test_df)

# Evaluate
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="accuracy"
)

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="f1"
)

val_accuracy = evaluator_accuracy.evaluate(val_predictions)
val_f1 = evaluator_f1.evaluate(val_predictions)

test_accuracy = evaluator_accuracy.evaluate(test_predictions)
test_f1 = evaluator_f1.evaluate(test_predictions)

print("\n" + "=" * 80)
print("üìà BASELINE RESULTS")
print("=" * 80)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1-Score: {val_f1:.4f}")
print(f"Test Accuracy:       {test_accuracy:.4f}")
print(f"Test F1-Score:       {test_f1:.4f}")
print("=" * 80)

# Show confusion matrix (validation)
print("\nüîç Sample predictions (validation):")
val_predictions.select('text', 'sentiment', 'prediction').show(5, truncate=80)

# Save baseline model
baseline_model_path = "../models/baseline_lr_tfidf"
baseline_model.write().overwrite().save(baseline_model_path)
print(f"\nüíæ Baseline model saved to: {baseline_model_path}")

üéØ BASELINE MODEL: Logistic Regression with TF-IDF

‚è≥ Training baseline model...


                                                                                


üìä Making predictions on validation set...


                                                                                


üìà BASELINE RESULTS
Validation Accuracy: 0.8676
Validation F1-Score: 0.8512
Test Accuracy:       0.8628
Test F1-Score:       0.8459

üîç Sample predictions (validation):


                                                                                

+--------------------------------------------------------------------------------+---------+----------+
|                                                                            text|sentiment|prediction|
+--------------------------------------------------------------------------------+---------+----------+
|!!!BEWARE, MAJOR SCAMMER, RIP OFF!!!\nI called this company because of the go...| negative|       1.0|
|"A" for effort and service. Same for price, very reasonable. Wine list expans...|  neutral|       1.0|
|"C-"I know I know... it's a buffet but dear god! That was not sushi, not sure...|  neutral|       0.0|
|"Come on Man"I only wanted a decent Saturday morning breakfast and received a...| negative|       1.0|
|                     "Delicious" -Erock, Dan, Zeb\n\nChicken fried rice was tops| positive|       0.0|
+--------------------------------------------------------------------------------+---------+----------+
only showing top 5 rows


üíæ Baseline model saved to: ../model

In [11]:
#@title üíæ Export predictions for Tableau (FIXED - Direct Spark Export)

print("=" * 80)
print("üìä PREPARING DATA FOR TABLEAU")
print("=" * 80)

# 1. Get predictions with all relevant info
tableau_predictions = test_predictions.select(
    F.col('text'),
    F.col('sentiment').alias('true_sentiment'),
    F.col('prediction'),
    F.col('text_length'),
    F.col('word_count')
).withColumn('predicted_sentiment',
    F.when(F.col('prediction') == 0.0, 'negative')
    .when(F.col('prediction') == 1.0, 'neutral')
    .when(F.col('prediction') == 2.0, 'positive')
    .otherwise('unknown')
).withColumn('is_correct',
    F.when(F.col('true_sentiment') == F.col('predicted_sentiment'), 1).otherwise(0)
)

print("\nüìã Sample data for Tableau:")
tableau_predictions.show(5, truncate=80)

# 2. Create confusion matrix data
print("\nüìä Creating confusion matrix data...")
confusion_matrix = test_predictions.groupBy('sentiment', 'prediction').count()

confusion_matrix_labeled = confusion_matrix.withColumn('predicted_sentiment',
    F.when(F.col('prediction') == 0.0, 'negative')
    .when(F.col('prediction') == 1.0, 'neutral')
    .when(F.col('prediction') == 2.0, 'positive')
    .otherwise('unknown')
).select(
    F.col('sentiment').alias('true_sentiment'),
    F.col('predicted_sentiment'),
    F.col('count')
)

print("\nüî¢ Confusion Matrix:")
confusion_matrix_labeled.orderBy('true_sentiment', 'predicted_sentiment').show()

# 3. Calculate metrics per class
print("\nüìà Creating per-class metrics...")

predictions_with_labels = test_predictions.select('label', 'prediction')

# Calculate metrics for each class
metrics_list = []
for class_idx, class_name in enumerate(['negative', 'neutral', 'positive']):
    tp = predictions_with_labels.filter(
        (F.col('label') == class_idx) & (F.col('prediction') == class_idx)
    ).count()
    
    fp = predictions_with_labels.filter(
        (F.col('label') != class_idx) & (F.col('prediction') == class_idx)
    ).count()
    
    fn = predictions_with_labels.filter(
        (F.col('label') == class_idx) & (F.col('prediction') != class_idx)
    ).count()
    
    tn = predictions_with_labels.filter(
        (F.col('label') != class_idx) & (F.col('prediction') != class_idx)
    ).count()
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    metrics_list.append({
        'sentiment': class_name,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'true_negatives': tn,
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1, 4)
    })

metrics_df = spark.createDataFrame(metrics_list)
print("\nüìä Per-class metrics:")
metrics_df.show()

# 4. Text length analysis
print("\nüìè Creating text length analysis...")
length_analysis = tableau_predictions.groupBy('true_sentiment', 'is_correct').agg(
    F.avg('text_length').alias('avg_text_length'),
    F.avg('word_count').alias('avg_word_count'),
    F.count('*').alias('count')
)

print("\nüìä Text length by sentiment and correctness:")
length_analysis.show()

# 5. Export everything using Spark (NO PANDAS!)
print("\nüíæ Exporting to CSV for Tableau using Spark...")

output_dir = "../data/tableau"

# SAVE DIRECTLY WITH SPARK - No toPandas()!
print("\n‚è≥ Saving predictions sample...")
tableau_predictions.limit(10000).coalesce(1).write.mode('overwrite').option('header', 'true').csv(f"{output_dir}/predictions_sample")

print("\n‚è≥ Saving confusion matrix...")
confusion_matrix_labeled.coalesce(1).write.mode('overwrite').option('header', 'true').csv(f"{output_dir}/confusion_matrix")

print("\n‚è≥ Saving per-class metrics...")
metrics_df.coalesce(1).write.mode('overwrite').option('header', 'true').csv(f"{output_dir}/metrics_per_class")

print("\n‚è≥ Saving text length analysis...")
length_analysis.coalesce(1).write.mode('overwrite').option('header', 'true').csv(f"{output_dir}/text_length_analysis")

# 6. Create and save summary
print("\n‚è≥ Saving model summary...")
overall_summary = spark.createDataFrame([{
    'model_name': 'Baseline (LR + TF-IDF)',
    'accuracy': round(test_accuracy, 4),
    'f1_score': round(test_f1, 4),
    'total_predictions': test_predictions.count(),
    'correct_predictions': tableau_predictions.filter(F.col('is_correct') == 1).count(),
    'incorrect_predictions': tableau_predictions.filter(F.col('is_correct') == 0).count()
}])

overall_summary.show()
overall_summary.coalesce(1).write.mode('overwrite').option('header', 'true').csv(f"{output_dir}/model_summary")

print("\n" + "=" * 80)
print("‚úÖ ALL DATA EXPORTED FOR TABLEAU!")
print("=" * 80)
print(f"\nüìÅ Files saved to: {output_dir}/")
print("   Each folder contains CSV files (look for part-*.csv inside each folder)")

üìä PREPARING DATA FOR TABLEAU

üìã Sample data for Tableau:


                                                                                

+--------------------------------------------------------------------------------+--------------+----------+-----------+----------+-------------------+----------+
|                                                                            text|true_sentiment|prediction|text_length|word_count|predicted_sentiment|is_correct|
+--------------------------------------------------------------------------------+--------------+----------+-----------+----------+-------------------+----------+
|! \nBreakfast buffet offers omelettes and eggs made to order and is only 20 d...|      positive|       0.0|        213|        39|           negative|         0|
|!!! Celiac Friendly !!!\n\nWhile they don't have a gluten-free menu, the staf...|      positive|       0.0|        470|        87|           negative|         0|
|!!!!!!BE AWARE!!!!! I went to this place paid their fee put the application a...|      negative|       1.0|       1613|       320|            neutral|         0|
|!!!The customer servi

                                                                                

+--------------+-------------------+------+
|true_sentiment|predicted_sentiment| count|
+--------------+-------------------+------+
|      negative|           negative| 20066|
|      negative|            neutral|134599|
|      negative|           positive|  6563|
|       neutral|           negative| 35548|
|       neutral|            neutral| 17450|
|       neutral|           positive| 16173|
|      positive|           negative|451079|
|      positive|            neutral|  8794|
|      positive|           positive|  7314|
+--------------+-------------------+------+


üìà Creating per-class metrics...


                                                                                


üìä Per-class metrics:
+--------+---------------+---------------+---------+------+---------+--------------+--------------+
|f1_score|false_negatives|false_positives|precision|recall|sentiment|true_negatives|true_positives|
+--------+---------------+---------------+---------+------+---------+--------------+--------------+
|  0.9264|          16108|          55614|   0.8902|0.9655| negative|        174785|        451079|
|  0.8358|          26629|          26244|   0.8368|0.8348|  neutral|        510114|        134599|
|   0.326|          52998|          13877|   0.5382|0.2338| positive|        614538|         16173|
+--------+---------------+---------------+---------+------+---------+--------------+--------------+


üìè Creating text length analysis...

üìä Text length by sentiment and correctness:


                                                                                

+--------------+----------+------------------+------------------+------+
|true_sentiment|is_correct|   avg_text_length|    avg_word_count| count|
+--------------+----------+------------------+------------------+------+
|      positive|         1|1084.9219305441618|202.45187312004376|  7314|
|      positive|         0|492.51661219510606| 90.96509905995786|459873|
|       neutral|         1|  663.118452722063|125.10618911174785| 17450|
|      negative|         0| 756.4438092404472|143.14610872614443|141162|
|       neutral|         0| 671.3831325767096| 125.3961640339514| 51721|
|      negative|         1|434.26975979268417| 81.64143327020831| 20066|
+--------------+----------+------------------+------------------+------+


üíæ Exporting to CSV for Tableau using Spark...

‚è≥ Saving predictions sample...


                                                                                


‚è≥ Saving confusion matrix...


                                                                                


‚è≥ Saving per-class metrics...


                                                                                


‚è≥ Saving text length analysis...


                                                                                


‚è≥ Saving model summary...


                                                                                

+--------+-------------------+--------+---------------------+--------------------+-----------------+
|accuracy|correct_predictions|f1_score|incorrect_predictions|          model_name|total_predictions|
+--------+-------------------+--------+---------------------+--------------------+-----------------+
|  0.8628|              44830|  0.8459|               652756|Baseline (LR + TF...|           697586|
+--------+-------------------+--------+---------------------+--------------------+-----------------+


‚úÖ ALL DATA EXPORTED FOR TABLEAU!

üìÅ Files saved to: ../data/tableau/
   Each folder contains CSV files (look for part-*.csv inside each folder)


                                                                                

In [12]:
#@title Consolidate CSVs

import shutil
import glob

# Move CSV files out of Spark folders
output_dir = "../data/tableau"
folders = ['predictions_sample', 'confusion_matrix', 'metrics_per_class', 
           'text_length_analysis', 'model_summary']

for folder in folders:
    folder_path = f"{output_dir}/{folder}"
    csv_files = glob.glob(f"{folder_path}/part-*.csv")
    
    if csv_files:
        csv_file = csv_files[0]
        new_path = f"{output_dir}/{folder}.csv"
        shutil.copy(csv_file, new_path)
        print(f"‚úÖ {folder}.csv created")

print("\nüéâ Clean CSVs ready for Tableau!")

‚úÖ predictions_sample.csv created
‚úÖ confusion_matrix.csv created
‚úÖ metrics_per_class.csv created
‚úÖ text_length_analysis.csv created
‚úÖ model_summary.csv created

üéâ Clean CSVs ready for Tableau!
