In [1]:
import os
spark_home = os.path.abspath(os.getcwd() + "/../spark-3.5.5-bin-hadoop3")
hadoop_home = os.path.abspath(os.getcwd() + "/../winutils")
print(f"I am using the following SPARK_HOME: {spark_home}")
if os.name == 'nt':
    os.environ["HADOOP_HOME"] = f"{hadoop_home}"
    print(f"Windows detected: set HADOOP_HOME to: {os.environ['HADOOP_HOME']}")
    hadoop_bin = os.path.join(hadoop_home, "bin")
    os.environ["PATH"] = f"{hadoop_bin};{os.environ['PATH']}"
    print(f"  Also added Hadoop bin directory to PATH: {hadoop_bin}")

import findspark
import pyspark
from pyspark.streaming import StreamingContext

findspark.init(spark_home)
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession.builder.getOrCreate()


I am using the following SPARK_HOME: C:\Users\arthu\Desktop\spark\spark-3.5.5-bin-hadoop3
Windows detected: set HADOOP_HOME to: C:\Users\arthu\Desktop\spark\winutils
  Also added Hadoop bin directory to PATH: C:\Users\arthu\Desktop\spark\winutils\bin


In [2]:
from pyspark.sql.functions import col, concat_ws

# Load your JSON file
df = spark.read.option("multiLine", True).json("C:/Users/arthu/Desktop/spark/Data JSON/data_ass3.json")

# Show original schema to verify structure
df.printSchema()

# Use the first category as the label (fine-grained)
df = df.withColumn("main_category", col("categories")[0])

# Join title and summary into one feature column
df = df.withColumn("text", concat_ws(" ", col("title"), col("summary")))

# Trimming leading and trailing white spaces 
from pyspark.sql.functions import trim
df = df.withColumn("text", trim(col("text")))

#Lowercase text 
from pyspark.sql.functions import lower
df = df.withColumn("text", lower(col("text")))

# Preview result
df.select("text", "main_category").show(5, truncate=False)


root
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- published: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- title: string (nullable = true)
 |-- updated: string (nullable = true)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 3.1 Logistic Regression

In [3]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


## 3.1 Logistic Regression

In [4]:
# Tokenize text into words
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words
stopwords = StopWordsRemover(inputCol="words", outputCol="filtered")

# Convert words to term frequencies
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)

# Compute IDF (inverse document frequency)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Index the label (main_category) as numerical class
label_indexer = StringIndexer(inputCol="main_category", outputCol="label")

# Classifier
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Assemble into pipeline
pipeline_LR = Pipeline(stages=[tokenizer, stopwords, tf, idf, label_indexer, lr])


In [5]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)


In [6]:
model = pipeline_LR.fit(train_data)


## 4. Model Evaluation

In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [8]:
predictions = model.transform(test_data)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Test set accuracy: {accuracy:.4f}")


Test set accuracy: 0.5969


In [9]:
predictions.select("main_category", "prediction", "probability", "text").show(5, truncate=False)


+-------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
from pyspark.ml.feature import IndexToString

# Get the fitted StringIndexerModel from the pipeline
label_index_model = model.stages[4]  # Assuming indexer was 5th in your pipeline

# Reverse the numeric prediction back to original label string
label_reverse = IndexToString(
    inputCol="prediction",
    outputCol="predicted_category",
    labels=label_index_model.labels
)

# Apply it
predictions_labeled = label_reverse.transform(predictions)

# View with readable category
predictions_labeled.select("main_category", "predicted_category", "probability").show(5, truncate=False)



+-------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
import os
os.environ["HADOOP_HOME"] = "C:/Users/arthu/Desktop/spark/winutils"


## 3.2 Random Forest

In [10]:
## Trying improvements with random forest (sparse, not dense, as this crashes my session)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.functions import udf
from pyspark.ml.linalg import DenseVector, VectorUDT

# Build minimal pipeline
tokenizer = Tokenizer(inputCol="text", outputCol="words")
stopwords = StopWordsRemover(inputCol="words", outputCol="filtered")
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
label_indexer = StringIndexer(
    inputCol="main_category",
    outputCol="label",
    handleInvalid="keep"
)

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, maxDepth=5)

pipeline_rf = Pipeline(stages=[tokenizer, stopwords, tf, idf, label_indexer, rf])

# Fit the model
model = pipeline_rf.fit(train_data)

# Predict
predictions_rf = model.transform(test_data)


## Evaluation

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [12]:
# Logistic Regression
acc_lr = evaluator.evaluate(predictions)
print(f"LogisticRegression accuracy: {acc_lr:.4f}")

LogisticRegression accuracy: 0.5969


In [13]:
# Random Forest
acc_rf = evaluator.evaluate(predictions_rf)
print(f"RandomForest accuracy: {acc_rf:.4f}")

RandomForest accuracy: 0.1784


## Random forest tuning with Cross Validation
The poor performance of the Random Forest model is expected. TF-IDF produces high-dimensional sparse vectors, which are not ideal for tree-based models. We also used basic hyperparameters (10 trees, max depth of 5), as larger settings caused runtime issues. Additionally, we avoided dense vectorization due to memory constraints, which may further reduce the model's ability to make effective splits.

In [14]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Slim grid for quick testing (my laptop cannot handle bigger)
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

cv_rf = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGrid_rf,
    evaluator=evaluator,
    numFolds=2  # use 2 instead of 3 for speed
)

# Fit the lighter model
cv_model_rf = cv_rf.fit(train_data)

# Predict and evaluate
predictions_rf_tuned = cv_model_rf.transform(test_data)
acc_rf_tuned = evaluator.evaluate(predictions_rf_tuned)
print(f"Tuned Random Forest accuracy (lite): {acc_rf_tuned:.4f}")



Tuned Random Forest accuracy (lite): 0.2225


## Alternative feature representations
Word2vec: works better than TF_IDF for random forest, but we also keep the latter for comparison in early analysis with logistic regression, where TF-IDF is better (more interpretable than Word2vec)


### Word2Vec for RF

In [15]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Updated feature extraction: Word2Vec instead of TF-IDF (again smaller sizes due to runtime)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
stopwords = StopWordsRemover(inputCol="words", outputCol="filtered")
word2vec = Word2Vec(inputCol="filtered", outputCol="features", vectorSize=10, minCount=5)

label_indexer = StringIndexer(inputCol="main_category", outputCol="label", handleInvalid="keep")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, maxDepth=5)

pipeline_w2v_rf = Pipeline(stages=[tokenizer, stopwords, word2vec, label_indexer, rf])

# Train
model_w2v_rf = pipeline_w2v_rf.fit(train_data)

# Predict
predictions_w2v_rf = model_w2v_rf.transform(test_data)

# Evaluate
acc_w2v_rf = evaluator.evaluate(predictions_w2v_rf)
print(f"Random Forest with Word2Vec accuracy: {acc_w2v_rf:.4f}")



Random Forest with Word2Vec accuracy: 0.2888


### word2vec for logistic regression

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- 4. Evaluator ---
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

# --- 5. Word2Vec + Logistic Regression Pipeline ---
tokenizer = Tokenizer(inputCol="text", outputCol="words")
stopwords = StopWordsRemover(inputCol="words", outputCol="filtered")
word2vec = Word2Vec(inputCol="filtered", outputCol="features", vectorSize=100, minCount=5)

label_indexer = StringIndexer(inputCol="main_category", outputCol="label", handleInvalid="keep")
lr = LogisticRegression(labelCol="label", featuresCol="features")

pipeline_w2v_lr = Pipeline(stages=[tokenizer, stopwords, word2vec, label_indexer, lr])

model_w2v_lr = pipeline_w2v_lr.fit(train_data)
predictions_w2v_lr = model_w2v_lr.transform(test_data)

acc_w2v_lr = evaluator.evaluate(predictions_w2v_lr)
print(f"Logistic Regression with Word2Vec accuracy: {acc_w2v_lr:.4f}")

Logistic Regression with Word2Vec accuracy: 0.5788


# Small LLM: Bart Facebook on small number of articles

install transformers torch
with pixi in terminal folder where pixi.toml is

pixi add conda-forge::transformers conda-forge::pytorch


In [4]:
from transformers import pipeline
import pandas as pd

# Load the BART model for zero-shot classification
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Use your existing df with 'text' and 'main_category'
sample_df = df.select("text", "main_category").limit(10).toPandas()

# Get distinct labels from your own data
candidate_labels = sample_df["main_category"].unique().tolist()

# Run zero-shot classification
results = []
for _, row in sample_df.iterrows():
    text = row["text"][:1000]  # truncate to stay within model limits
    true_label = row["main_category"]

    prediction = classifier(text, candidate_labels)
    predicted_label = prediction["labels"][0]

    results.append({
        "true": true_label,
        "predicted": predicted_label,
        "scores": prediction["scores"]
    })

# Display result comparison
results_df = pd.DataFrame(results)
display(results_df)


Device set to use cpu


Unnamed: 0,true,predicted,scores
0,hep-lat,quant-ph,"[0.2549302279949188, 0.24137187004089355, 0.14..."
1,physics.optics,quant-ph,"[0.23048396408557892, 0.17416708171367645, 0.1..."
2,astro-ph.EP,quant-ph,"[0.20622789859771729, 0.1806761622428894, 0.16..."
3,physics.comp-ph,quant-ph,"[0.29370442032814026, 0.11989302933216095, 0.1..."
4,physics.atom-ph,quant-ph,"[0.20944005250930786, 0.17453129589557648, 0.1..."
5,cs.AR,quant-ph,"[0.30197227001190186, 0.18922507762908936, 0.1..."
6,cond-mat.quant-gas,quant-ph,"[0.22216995060443878, 0.12882345914840698, 0.1..."
7,physics.acc-ph,quant-ph,"[0.21788351237773895, 0.1835835725069046, 0.12..."
8,math.CV,quant-ph,"[0.16354073584079742, 0.15602795779705048, 0.1..."
9,quant-ph,quant-ph,"[0.3553391695022583, 0.12906913459300995, 0.11..."


In [5]:
correct = results_df["true"] == results_df["predicted"]
accuracy = correct.sum() / len(correct)
print(f"Zero-shot accuracy: {accuracy:.2f}")


Zero-shot accuracy: 0.10


Add in terminal: pixi add conda-forge::tqdm


# Trying new approach for LLM: Fine tuning the LLM 

run in terminal: pixi add conda-forge::accelerate


In [5]:
import pandas as pd
from datasets import Dataset

# Start with your Spark or Pandas dataframe
df_pd = df.toPandas() if not isinstance(df, pd.DataFrame) else df.copy()

# Step 1: Create input text and numerical labels
df_pd["text"] = df_pd["title"] + ". " + df_pd["summary"]
label2id = {label: i for i, label in enumerate(df_pd["main_category"].unique())}
id2label = {v: k for k, v in label2id.items()}
df_pd["label"] = df_pd["main_category"].map(label2id)

# Step 2: Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df_pd[["text", "label"]])
dataset = dataset.train_test_split(test_size=0.2)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

model_name = "facebook/bart-base"  # You can also use "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

tokenized = dataset.map(tokenize, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id))

args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    compute_metrics=None  # Add accuracy/F1 if needed
)

trainer.train()



Map:   0%|          | 0/49912 [00:00<?, ? examples/s]

Map:   0%|          | 0/12478 [00:00<?, ? examples/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


In [None]:
preds = trainer.predict(tokenized["test"])
pred_labels = [id2label[i] for i in preds.predictions.argmax(axis=1)]

# Compare with true labels
from sklearn.metrics import accuracy_score
true_labels = [id2label[i] for i in tokenized["test"]["label"]]
print("Accuracy:", accuracy_score(true_labels, pred_labels))


In [1]:
import torch
print("GPU available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")


GPU available: False
Device name: CPU only
