In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, Word2Vec, NGram, VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import RandomForestClassifier
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# ===================== 1. Spark Init =====================
spark = SparkSession.builder.appName("Embedding_Comparison").getOrCreate()

# ===================== 2. Load Data =====================
pandas_df = pd.read_excel("processed_data6Financial.xlsx")
spark_df = spark.createDataFrame(pandas_df)

df = spark_df.select(
    col("PhraseId").cast("int"),
    col("Sentiment_Words"),
    col("Sentiment").cast("int").alias("label")
)
df = df.withColumn("sentence", concat_ws(" ", col("Sentiment_Words")))

In [2]:
# ===================== 3. Tokenizer =====================
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# ===================== 4. TF-IDF =====================
hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features_tfidf")

# ===================== 5. Word2Vec =====================
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="words", outputCol="features_w2v")

# ===================== 6. N-Gram =====================
ngram = NGram(n=2, inputCol="words", outputCol="bigrams")
hashing_tf_ngram = HashingTF(inputCol="bigrams", outputCol="rawFeatures_ngram", numFeatures=1000)
idf_ngram = IDF(inputCol="rawFeatures_ngram", outputCol="features_ngram")

# ===================== 7. BERT =====================
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_model.eval()
def get_bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy().tolist()

bert_pdf = df.select("PhraseId", "sentence").toPandas()
bert_pdf["bert_features"] = bert_pdf["sentence"].apply(get_bert_embedding)
bert_df = spark.createDataFrame(bert_pdf[["PhraseId", "bert_features"]])

# Merge BERT into original df
df = df.join(bert_df, on="PhraseId")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# ===================== 8. Feature Conversion =====================
to_vector_udf = udf(lambda v: Vectors.dense(v), VectorUDT())

df = df.withColumn("features_bert", to_vector_udf(col("bert_features")))

# ===================== 9. Feature Processing =====================
# Apply tokenization
df_tokenized = tokenizer.transform(df)

# TF-IDF
df_tfidf = hashing_tf.transform(df_tokenized)
df_tfidf = idf.fit(df_tfidf).transform(df_tfidf)

# Word2Vec
df_w2v = word2vec.fit(df_tokenized).transform(df_tokenized)

# N-Gram
df_ngram = ngram.transform(df_tokenized)
df_ngram = hashing_tf_ngram.transform(df_ngram)
df_ngram = idf_ngram.fit(df_ngram).transform(df_ngram)
# ===================== 10. Prepare Inputs =====================
data_sources = {
    "TF-IDF": (df_tfidf.select("features_tfidf", "label").withColumnRenamed("features_tfidf", "features")),
    "Word2Vec": (df_w2v.select("features_w2v", "label").withColumnRenamed("features_w2v", "features")),
    "N-Gram": (df_ngram.select("features_ngram", "label").withColumnRenamed("features_ngram", "features")),
    "BERT": df.select("features_bert", "label").withColumnRenamed("features_bert", "features")
}

In [8]:
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# ===================== 11. Train and Evaluate Models =====================
def train_and_evaluate_random_forest(data_source):
    # Split the data into training and test sets
    train_df, test_df = data_source.randomSplit([0.8, 0.2], seed=42)

    # Initialize the Random Forest classifier
    rf = RandomForestClassifier(
        labelCol="label",
        featuresCol="features",
        seed=42,
        numTrees=20,  # Best parameter for Word2Vec and N-grams
        maxDepth=5
    )

    # Fit the model
    rf_model = rf.fit(train_df)

    # Make predictions
    predictions = rf_model.transform(test_df)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    return accuracy

# Function to train and evaluate XGBoost
def train_and_evaluate_xgboost(data_source):
    # Split the data into training and test sets
    train_df, test_df = data_source.randomSplit([0.8, 0.2], seed=42)

    # Initialize the XGBoost classifier
    xgb = SparkXGBClassifier(
        label_col="label",
        features_col="features",
        num_workers=1,
        num_class=5,
        max_depth=5,
        learning_rate=0.2
    )

    # Fit the model
    model = xgb.fit(train_df)

    # Make predictions
    predictions = model.transform(test_df)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    return accuracy

# ===================== 12. Evaluate All Models =====================
accuracies = {}
for name, data in data_sources.items():
    accuracies[name + " (XGBoost)"] = train_and_evaluate_xgboost(data)
    accuracies[name + " (Random Forest)"] = train_and_evaluate_random_forest(data)

# ===================== 13. Display Results =====================
for method, accuracy in accuracies.items():
    print(f"Accuracy for {method}: {accuracy:.4f}")
if __name__ == "__main__":
    try:
        accuracies = {}
        for name, data in data_sources.items():
            accuracies[name + " (XGBoost)"] = train_and_evaluate_xgboost(data)
            accuracies[name + " (Random Forest)"] = train_and_evaluate_random_forest(data)

        for method, accuracy in accuracies.items():
            print(f"Accuracy for {method}: {accuracy:.4f}")
    except Exception as e:
        print(f"Error occurred: {str(e)}")

INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	

Accuracy for TF-IDF (XGBoost): 0.2485
Accuracy for TF-IDF (Random Forest): 0.2840
Accuracy for Word2Vec (XGBoost): 0.2663
Accuracy for Word2Vec (Random Forest): 0.2722
Accuracy for N-Gram (XGBoost): 0.2130
Accuracy for N-Gram (Random Forest): 0.2781
Accuracy for BERT (XGBoost): 0.3526
Accuracy for BERT (Random Forest): 0.3579


INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	

Accuracy for TF-IDF (XGBoost): 0.2485
Accuracy for TF-IDF (Random Forest): 0.2840
Accuracy for Word2Vec (XGBoost): 0.2663
Accuracy for Word2Vec (Random Forest): 0.2722
Accuracy for N-Gram (XGBoost): 0.2130
Accuracy for N-Gram (Random Forest): 0.2781
Accuracy for BERT (XGBoost): 0.3526
Accuracy for BERT (Random Forest): 0.3579


In [13]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Function to train and evaluate stacking model
def train_and_evaluate_stacking(data_source):
    # Split the data into training and test sets
    train_df, test_df = data_source.randomSplit([0.8, 0.2], seed=42)

    # Train Random Forest and XGBoost models
    rf_model = RandomForestClassifier(
        labelCol="label",
        featuresCol="features",
        seed=42,
        numTrees=20,
        maxDepth=5
    ).fit(train_df)

    xgb_model = SparkXGBClassifier(
        label_col="label",
        features_col="features",
        num_workers=1,
        num_class=5,
        max_depth=5,
        learning_rate=0.2
    ).fit(train_df)

    # Make predictions with both models
    rf_predictions = rf_model.transform(test_df).select("prediction").withColumnRenamed("prediction", "rf_prediction")
    xgb_predictions = xgb_model.transform(test_df).select("prediction").withColumnRenamed("prediction", "xgb_prediction")

    # Combine predictions into a single DataFrame
    combined_predictions = test_df.select("label").join(rf_predictions).join(xgb_predictions)

    # Assemble combined features
    assembler = VectorAssembler(inputCols=["rf_prediction", "xgb_prediction"], outputCol="stacked_features")
    stacked_data = assembler.transform(combined_predictions)

    # Train meta-classifier
    meta_classifier = LogisticRegression(labelCol="label", featuresCol="stacked_features")
    meta_model = meta_classifier.fit(stacked_data)

    # Make final predictions
    final_predictions = meta_model.transform(stacked_data)

    # Evaluate the stacking model
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(final_predictions)

    return accuracy

# ===================== 14. Evaluate Stacking Model =====================
stacking_accuracies = {}
for name, data in data_sources.items():
    stacking_accuracies[name + " (Stacking)"] = train_and_evaluate_stacking(data)

# ===================== 15. Display Stacking Results =====================
for method, accuracy in stacking_accuracies.items():
    print(f"Accuracy for {method}: {accuracy:.4f}")

if __name__ == "__main__":
    try:
        stacking_accuracies = {}
        for name, data in data_sources.items():
            stacking_accuracies[name + " (Stacking)"] = train_and_evaluate_stacking(data)

        for method, accuracy in stacking_accuracies.items():
            print(f"Accuracy for {method}: {accuracy:.4f}")
    except Exception as e:
        print(f"Error occurred: {str(e)}")


INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	

Accuracy for TF-IDF (Stacking): 0.3018
Accuracy for Word2Vec (Stacking): 0.2959
Accuracy for N-Gram (Stacking): 0.2781
Accuracy for BERT (Stacking): 0.2426


INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'device': 'cpu', 'learning_rate': 0.2, 'max_depth': 5, 'num_class': 5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	

Accuracy for TF-IDF (Stacking): 0.3018
Accuracy for Word2Vec (Stacking): 0.2959
Accuracy for N-Gram (Stacking): 0.2781
Accuracy for BERT (Stacking): 0.2426
