In [None]:
import torch 
from transformers import pipeline

In [None]:
# Apple Metal (MPS) backend smoke test, see
# https://developer.apple.com/metal/pytorch/

if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    # Allocate a one-element tensor on the MPS device and echo it.
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

In [None]:
# Zero-shot classification pipeline built on the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Score one short sentence against three candidate labels.
candidate_labels = ["travel", "cooking", "dancing"]
sequence_to_classify = "one day I will see the world"

classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# Same sentence as before, scored against a different label set.
candidate_labels = ["home", "earth", "cat"]
sequence_to_classify = "one day I will see the world"

classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# A sentence that never mentions a label word, scored against the hobby labels.
candidate_labels = ["travel", "cooking", "dancing"]
sequence_to_classify = "plane tickets are becoming cheaper"

classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# Abstract-style text scored against four research-field labels.
candidate_labels = ["computer science", "physics", "economics", "mathematics"]
sequence_to_classify = (
    "Assisting humans with AI agents in maneuvering micromobility devices "
    "presents a viable solution for enhancing safety and efficiency. "
    "In this work, we present a scalable urban simulation solution "
    "to advance autonomous micromobility"
)

classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# One-sentence abstract scored against the same four research-field labels.
candidate_labels = ["computer science", "physics", "economics", "mathematics"]
sequence_to_classify = (
    "We describe a comma 2-comonad on the 2-category whose objects are functors, "
    "1-cell are colax squares and 2-cells are their transformations"
)

classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# Longer abstract scored against maths sub-fields. multi_label=True is passed
# through to the pipeline call.
candidate_labels = ["category theory", "combinatorics", "commutative algebra", "algebraic topology"]
sequence_to_classify = (
    "We describe a comma 2-comonad on the 2-category whose objects are functors, "
    "1-cell are colax squares and 2-cells are their transformations.  "
    "We give a complete description of the Eilenberg-Moore 2-category of colax coalgebras, "
    "colax morphisms between them and their transformations and we show how many "
    "fundamental constructions in formal category theory like adjoint triples, "
    "distributive laws, comprehension structures, Frobenius functors etc. "
    "naturally fit in this context."
)

classifier(sequence_to_classify, candidate_labels=candidate_labels, multi_label=True)

# 02 Test on actual data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType

In [None]:
# Reuse the active SparkSession if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the Parquet dataset at data/df_all_cleaned (path is relative to the working directory).
df_pub = spark.read.parquet("data/df_all_cleaned")

In [None]:
# Number of rows in df_pub.
df_pub.count()

In [None]:
# df_pub.write.option("compression", "uncompressed").parquet("data/df_all_cleaned_uncompressed")

In [None]:
# Read the category lookup table; later cells use its "group" and "category_id" columns.
df_categories = spark.read.parquet("data/arxiv_categories.parquet.gzip")

In [None]:
# Aggregate the distinct "group" values into a single row and pull it to the driver.
row = (
    df_categories
    .agg(collect_set("group").alias("groups"))
    .first()
)
categories = row["groups"]

print(categories)

In [None]:
# Aggregate the distinct "category_id" values into a single row and pull it to the driver.
row = (
    df_categories
    .agg(collect_set("category_id").alias("subcategories"))
    .first()
)
subcategories = row["subcategories"]

print(subcategories)

In [None]:
# Number of distinct category_id values collected in `subcategories`.
len(subcategories)

In [None]:
# Print a preview of df_pub.
df_pub.show()

In [None]:
# Pull one sample summary (the third row) to the driver.
# take(3) fetches only the first three rows; collect() would materialise every
# summary in the dataset on the driver just to index a single element.
text = df_pub.select("summary").take(3)[2]["summary"]
text

In [None]:
# Score the sampled summary against the values held in `categories`.
candidate_labels = categories
sequence_to_classify = text

result = classifier(sequence_to_classify, candidate_labels=candidate_labels)

In [None]:
# Display the raw pipeline output; the next cells read its 'labels' and 'scores' entries.
result

In [None]:
# First entry of the returned label list.
result['labels'][0]

In [None]:
# First entry of the returned score list, rounded to four decimals.
round(result['scores'][0], 4)

In [None]:
def predict(text, candidate_labels = categories):
    """Return the first label and its score from the zero-shot classifier.

    Args:
        text: String to classify. ``None`` is accepted (for example a null
            column value when this function is wrapped in a Spark UDF) and
            yields a null prediction instead of an exception.
        candidate_labels: Labels to score against; defaults to ``categories``.

    Returns:
        A ``(pred_label, score)`` tuple. ``score`` is a float rounded to four
        decimals. ``(None, None)`` when ``text`` is ``None``.
    """
    if text is None:
        # The pipeline has nothing to score; report a null prediction.
        return None, None

    result = classifier(text, candidate_labels)
    # Entry 0 is taken as the prediction (the pipeline documents its labels as
    # ordered by likelihood).
    pred_label = result['labels'][0]
    score = float(round(result['scores'][0], 4))
    return pred_label, score

In [None]:
# Define schema for the struct return type
# Struct returned by the prediction UDF: a nullable string label and a nullable float score.
schema = (
    StructType()
    .add("pred_label", StringType(), True)
    .add("score", FloatType(), True)
)

In [None]:
# Wrap `predict` as a Spark UDF whose return type is `schema`.
predict_udf = udf(f=predict, returnType=schema)

In [None]:
# Restrict the experiments below to 10 rows of df_pub and preview them.
df_pub_subset = df_pub.limit(10)
df_pub_subset.show()

In [None]:
# Placeholders for the lazily created pipeline used by `predict`.
globals()['models_loaded'] = False
globals()['my_model'] = None

# Define your UDF
def predict(text, candidate_labels = categories):
    """Classify ``text`` with a lazily loaded zero-shot pipeline.

    The pipeline is created on first use and stored under ``globals()['my_model']``
    so later calls through the same globals can reuse it.

    Args:
        text: String to classify. ``None`` (for example a null column value
            passed in by Spark) yields a null prediction instead of an exception.
        candidate_labels: Labels to score against; defaults to ``categories``.

    Returns:
        A ``(pred_label, score)`` tuple. ``score`` is a float rounded to four
        decimals. ``(None, None)`` when ``text`` is ``None``.
    """
    if text is None:
        # Nothing to score; also avoids loading the model for a null row.
        return None, None

    # Decide on the model object itself rather than on the 'models_loaded' flag:
    # classify_partition in this notebook sets the same flag, which could leave
    # it True while 'my_model' is still None.
    model = globals().get('my_model')
    if model is None:
        # Alternative checkpoint:
        # globals()['my_model'] = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        model = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3")
        globals()['my_model'] = model
        globals()['models_loaded'] = True  # kept for code that still reads the flag

    result = model(text, candidate_labels)
    pred_label = result['labels'][0]
    score = float(round(result['scores'][0], 4))
    return pred_label, score

In [None]:
# Re-wrap the redefined `predict` so the UDF uses the lazy-loading version.
predict_udf = udf(f=predict, returnType=schema)

In [None]:
# Call `predict` directly (outside Spark) on the sampled summary.
predict(text)

In [None]:
# Add the struct returned by predict_udf as a new "pred" column.
df_with_predictions = df_pub_subset.withColumn(
    "pred",
    predict_udf(df_pub_subset["summary"]),
)

In [None]:
# Flatten the "pred" struct into top-level pred_label / score columns, then drop the struct.
df_with_labels = (
    df_with_predictions
    .select("*", "pred.pred_label", "pred.score")
    .drop("pred")
)

In [None]:
# Print df_with_labels; this is the point where the UDF is evaluated on the subset rows.
df_with_labels.show()

In [None]:
def process(time, rdd, colname = "summary"):
    """Classify one batch of JSON records and print the predictions.

    Args:
        time: Batch label or timestamp; only used in the printed banner.
        rdd: RDD passed to ``spark.read.json``. Empty RDDs are skipped.
        colname: Name of the text column handed to ``predict_udf``.

    Returns:
        None. The result is displayed with ``DataFrame.show()``.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)
    #df.show()

    # Utilize our predict function.
    # predict_udf wraps predict(text, ...), which expects the text itself, so
    # the column is passed directly. The previous `struct([df[colname])` had
    # unbalanced brackets (a syntax error) and used `struct`, which is never imported.
    df_withpreds = df.withColumn("pred", predict_udf(df[colname]))
    df_withpreds.show()

In [None]:
# `foreachRDD` is a Spark Streaming DStream method; a DataFrame has no such
# attribute, so the original call raised AttributeError. Hand the subset to
# `process` as an RDD of JSON strings instead. `process` only prints its result
# and returns None, so nothing is assigned here.
process("df_pub_subset", df_pub_subset.toJSON())

In [None]:
import pandas as pd
from transformers import pipeline
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Labels read by classify_partition (defined in this cell) at call time.
candidate_labels = categories

# Output schema for mapInPandas: the input text plus predicted label and score.
# NOTE: this rebinds `schema`, which an earlier cell used for the UDF return type.
schema = (
    StructType()
    .add("summary", StringType(), True)
    .add("predicted_label", StringType(), True)
    .add("confidence_score", FloatType(), True)
)

def classify_partition(pdf_iter):
    """Classify the ``summary`` column of each pandas batch.

    Written for ``DataFrame.mapInPandas``: consumes an iterator of pandas
    DataFrames and yields one DataFrame per input batch.

    Args:
        pdf_iter: Iterable of pandas DataFrames, each with a ``summary`` column.

    Yields:
        A pandas DataFrame with columns ``summary``, ``predicted_label`` and
        ``confidence_score``. Rows whose summary is not a string (for example
        a null) get a null label and score.
    """
    # Cache the pipeline under a key of its own. The previous version shared
    # the 'models_loaded' flag with `predict` and stored the model under
    # 'classifier', so a flag set by `predict` could skip loading here, and
    # running this on the driver overwrote the notebook-level `classifier`.
    model = globals().get("_partition_classifier")
    if model is None:
        model = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3", device=-1)
        globals()["_partition_classifier"] = model

    columns = ["summary", "predicted_label", "confidence_score"]

    for pdf in pdf_iter:
        results = []
        for text in pdf["summary"]:
            if not isinstance(text, str):
                # Missing summary: keep the row, emit a null prediction.
                results.append((text, None, None))
                continue
            res = model(text, candidate_labels)
            results.append((text, res["labels"][0], float(res["scores"][0])))

        yield pd.DataFrame(results, columns=columns)

In [None]:
# df = spark.createDataFrame([("A new method for protein folding",)], ["summary"])

In [None]:
# Run the batch-wise classifier over the subset and print the labelled rows.
df_with_labels = df_pub_subset.mapInPandas(func=classify_partition, schema=schema)
df_with_labels.show()