In [86]:
#Importing all
!pip install langid
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, PCA
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, lower, regexp_replace, split, udf
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.linalg import Vectors, SparseVector
import numpy as np
import langid
import matplotlib.pyplot as plt
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, lower, regexp_replace, split
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import DataFrame
# from pyspark.ml.feature import PCA
from pyspark.ml.feature import VectorSlicer



In [87]:
# spark = SparkSession.builder \
#     .appName("Project 3") \
#     .getOrCreate()

In [88]:
spark = SparkSession.builder \
    .config("spark.sql.shuffle.partitions", 10) \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "8g") \
    .master("local[*]") \
    .appName("Project 3") \
    .getOrCreate()

In [89]:
!unzip -u dblp.v10.zip

Archive:  dblp.v10.zip


In [90]:
df = spark.read.json("./dblp-ref/*.json", multiLine=True)

In [91]:
# Show the schema
df.printSchema()
df.show(5)
print(f"Number of records: {df.count()}")

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|
|In this paper, a ...

In [92]:
# Display summary statistics
df.describe().show()

# Check for missing values (excluding `isnan`)
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Distribution of citations
df.select("n_citation").describe().show()

# Check for null abstracts and titles
df.filter(df.abstract.isNull() | df.title.isNull()).show()

+-------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|summary|            abstract|                  id|       n_citation|               title|               venue|              year|
+-------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|  count|                   4|                   4|                4|                   4|                   4|                 4|
|   mean|                NULL|                NULL|             14.5|                NULL|                NULL|           2011.25|
| stddev|                NULL|                NULL|23.96525262402492|                NULL|                NULL|3.9475730941089733|
|    min|AdaBoost algorith...|00127ee2-cb05-48c...|                0|A Heterogeneous S...|Mathematics and C...|              2008|
|    max|The purpose of th...|4ab3735c-80f1-472...|               50|Preliminary De

In [93]:
# Function to detect language using langid
def detect_language(text):
    if text is None:
        return None
    lang, _ = langid.classify(text)
    return lang

# Registering UDF
lang_detect_udf = udf(detect_language, StringType())

# Add a new column for language detection
df = df.withColumn("language", lang_detect_udf(df.abstract))

# Filter only English documents
df = df.filter(df.language == 'en')

In [94]:
# Custom stop words
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 
                     'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 
                     'CZI', 'www']

In [95]:
# Lowercase and remove punctuation
df_cleaned = df.withColumn("cleaned_abstract", lower(col("abstract")))
df_cleaned = df_cleaned.withColumn("cleaned_abstract", regexp_replace(col("cleaned_abstract"), r'[!()\-\[\]{};:\'",<>./?@#$%^&*_~]', ''))

In [96]:
# Tokenize the text
df_tokenized = df_cleaned.withColumn("tokenized_abstract", split(col("cleaned_abstract"), " "))

In [97]:
# Remove stop words
remover = StopWordsRemover(inputCol="tokenized_abstract", outputCol="filtered_abstract", 
                           stopWords=StopWordsRemover().getStopWords() + custom_stop_words)
df_filtered = remover.transform(df_tokenized)

In [122]:
# Show the cleaned dataframe
# df_filtered.select("abstract", "cleaned_abstract", "tokenized_abstract", "filtered_abstract").show(truncate=False)

In [123]:
# Apply TF
hashingTF = HashingTF(inputCol="filtered_abstract", outputCol="rawFeatures", numFeatures=20000)
df_featurized = hashingTF.transform(df_filtered)
# df_featurized.show()

In [124]:
# Apply IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(df_featurized)
df_vectorized = idf_model.transform(df_featurized)
df_vectorized.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|language|    cleaned_abstract|  tokenized_abstract|   filtered_abstract|         rawFeatures|            features|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|      en|based on biologic...|[based, on, biolo...|[based, biologica...|(20000,[28,

In [125]:
# Select only the columns we need
df_vectorized = df_vectorized.select("id", "title", "features")
df_vectorized.show(5)

+--------------------+--------------------+--------------------+
|                  id|               title|            features|
+--------------------+--------------------+--------------------+
|4aa69add-3978-480...|Dynamic analysis ...|(20000,[28,42,274...|
|4ab3735c-80f1-472...|A new approach of...|(20000,[78,274,46...|
|00127ee2-cb05-48c...|Preliminary Desig...|(20000,[1072,1241...|
|001eef4f-1d00-4ae...|A Heterogeneous S...|(20000,[193,274,2...|
+--------------------+--------------------+--------------------+



In [126]:
#Clustering 

def sample_data(df, fraction, max_attempts=5):
    attempt = 0
    sampled_df = df.sample(fraction=fraction, seed=42)
    while sampled_df.count() == 0 and attempt < max_attempts:
        fraction *= 2  # Increase fraction to get more data
        sampled_df = df.sample(fraction=fraction, seed=42)
        attempt += 1
    if sampled_df.count() == 0:
        raise ValueError("Sampled DataFrame is empty after several attempts.")
    return sampled_df

# Sample the data with initial fraction 0.1
df_sampled = sample_data(df_vectorized, 0.1)
print(df_sampled)
print(f"Sampled DataFrame count: {df_sampled.count()}")
df_sampled.show(5)


DataFrame[id: string, title: string, features: vector]
Sampled DataFrame count: 3
+--------------------+--------------------+--------------------+
|                  id|               title|            features|
+--------------------+--------------------+--------------------+
|4aa69add-3978-480...|Dynamic analysis ...|(20000,[28,42,274...|
|00127ee2-cb05-48c...|Preliminary Desig...|(20000,[1072,1241...|
|001eef4f-1d00-4ae...|A Heterogeneous S...|(20000,[193,274,2...|
+--------------------+--------------------+--------------------+



In [127]:

# # Check if df_sampled is a Spark DataFrame
# if isinstance(df_sampled, DataFrame):
#     print("df_sampled is a Spark DataFrame")
# else:
#     print("df_sampled is not a Spark DataFrame")

# # Check if df_sampled has a column named "features"
# if "features" in df_sampled.columns:
#     print("df_sampled has a column named 'features'")
# else:
#     print("df_sampled does not have a column named 'features'")

In [128]:
# Use VectorSlicer to select relevant features
slicer = VectorSlicer(inputCol="features", outputCol="sliced_features", indices=[i for i in range(0, 50)])
df_sliced = slicer.transform(df_sampled)

# Update PCA to use sliced features
pca = PCA(k=20, inputCol="sliced_features", outputCol="pca_features")

# Repartition the data to increase parallelism
df_sampled_repartitioned = df_sliced.repartition(100)

# Persist the DataFrame to speed up the computation
df_sampled_repartitioned.persist()

# Fit the PCA model
pca_model = pca.fit(df_sampled_repartitioned)

# Transform the data using the PCA model
df_pca = pca_model.transform(df_sampled_repartitioned)

# Show the result (for debugging purposes)
df_pca.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|            features|     sliced_features|        pca_features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|4aa69add-3978-480...|Dynamic analysis ...|(20000,[28,42,274...|(50,[28,42],[0.91...|[-1.2958307800931...|
|00127ee2-cb05-48c...|Preliminary Desig...|(20000,[1072,1241...|          (50,[],[])|[0.0,0.0,0.0,0.0,...|
|001eef4f-1d00-4ae...|A Heterogeneous S...|(20000,[193,274,2...|          (50,[],[])|[0.0,0.0,0.0,0.0,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [129]:
# # Apply PCA to reduce dimensions (e.g., to 20 components)
# pca = PCA(k=20, inputCol="features", outputCol="pca_features")
# pca_model = pca.fit(df_sampled)
# df_pca = pca_model.transform(df_sampled)
# print(pca)

In [133]:
df_pca.select("pca_features").show(truncate=False)


+------------------------------------------------------------------------------------------------+
|pca_features                                                                                    |
+------------------------------------------------------------------------------------------------+
|[-1.295830780093199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]               |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]               |
+------------------------------------------------------------------------------------------------+



In [134]:
from pyspark.sql.types import IntegerType

def vector_length(vector):
    return len(vector)

vector_length_udf = udf(vector_length, IntegerType())
df_pca = df_pca.withColumn("pca_feature_length", vector_length_udf(df_pca["pca_features"]))
df_pca.select("pca_feature_length").distinct().show()

+------------------+
|pca_feature_length|
+------------------+
|                20|
+------------------+



In [156]:
from pyspark.sql.functions import col, size, udf, lit 
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT, SparseVector
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler

# Find the maximum length of vectors in pca_features
def vector_size_udf(v):
  """Returns the size of a vector, handling both dense and sparse vectors."""
  if isinstance(v, SparseVector):
    return v.size
  else:
    return len(v)

vector_size_udf = udf(vector_size_udf, IntegerType())
max_length = df_pca.select(vector_size_udf(col("pca_features")).alias("vector_size")).agg({"vector_size": "max"}).collect()[0][0]
print(f"Max length of pca_features: {max_length}")

# Pad shorter vectors with zeros to match the maximum length (Corrected)
def pad_vector(vec, max_length):
  if isinstance(vec, SparseVector):
    new_indices = vec.indices.tolist()
    new_values = vec.values.tolist()
  else:
    new_indices = list(range(len(vec)))  # Convert range to list
    new_values = vec.toArray().tolist()
  padding = [0.0] * (max_length - len(vec))
  return Vectors.sparse(max_length, new_indices + list(range(len(vec), max_length)), new_values + padding) 

pad_vector_udf = udf(pad_vector, VectorUDT())

# Use lit() to create a column of literal values for max_length
df_pca = df_pca.withColumn("pca_features", pad_vector_udf(col("pca_features"), lit(max_length)))


# Assemble the features
assembler = VectorAssembler(inputCols=["pca_features"], outputCol="assembled_features")
df_data = assembler.transform(df_pca)

# Run KMeans
kmeans = KMeans(k=5, seed=1, featuresCol="assembled_features", predictionCol="cluster")
kmeans_model = kmeans.fit(df_data)

# Print cluster centers
cluster_centers = kmeans_model.clusterCenters()
print("Cluster Centers:")
for center in cluster_centers:
    print(center)

# Evaluate clustering
predictions = kmeans_model.transform(df_data).select("cluster")
evaluator = ClusteringEvaluator(predictionCol="cluster", featuresCol="assembled_features", metricName="silhouette", distanceMeasure="cosine")
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Print the number of samples in each cluster
cluster_counts = predictions.groupBy("cluster").count().show()

# Print the cluster labels
kmeans_results = kmeans_model.transform(df_data).select("cluster", "pca_features")
kmeans_results.show(5)


PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_110/704885081.py", line 29, in pad_vector
TypeError: unsupported operand type(s) for +: 'range' and 'list'


In [148]:
# KMeans Clustering (using the optimal k from the elbow method)
optimal_k = 10  # Example optimal k value
kmeans = KMeans(k=optimal_k, seed=1, featuresCol="pca_features")
model = kmeans.fit(df_pca)
df_clustered = model.transform(df_pca)

Py4JJavaError: An error occurred while calling o2545.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 29 in stage 1337.0 failed 1 times, most recent failure: Lost task 29.0 in stage 1337.0 (TID 26003) (0797c18b42f3 executor driver): java.lang.ArrayIndexOutOfBoundsException: Index 2 out of bounds for length 2
	at org.apache.spark.ml.clustering.KMeansAggregator.euclideanUpdateInPlace(KMeans.scala:733)
	at org.apache.spark.ml.clustering.KMeansAggregator.add(KMeans.scala:706)
	at org.apache.spark.ml.clustering.KMeans.$anonfun$trainWithBlock$7(KMeans.scala:510)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at org.apache.spark.ml.clustering.KMeans.$anonfun$trainWithBlock$6(KMeans.scala:510)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:738)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:737)
	at org.apache.spark.ml.clustering.KMeans.trainWithBlock(KMeans.scala:524)
	at org.apache.spark.ml.clustering.KMeans.$anonfun$fit$1(KMeans.scala:380)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.clustering.KMeans.fit(KMeans.scala:371)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ArrayIndexOutOfBoundsException: Index 2 out of bounds for length 2
	at org.apache.spark.ml.clustering.KMeansAggregator.euclideanUpdateInPlace(KMeans.scala:733)
	at org.apache.spark.ml.clustering.KMeansAggregator.add(KMeans.scala:706)
	at org.apache.spark.ml.clustering.KMeans.$anonfun$trainWithBlock$7(KMeans.scala:510)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at org.apache.spark.ml.clustering.KMeans.$anonfun$trainWithBlock$6(KMeans.scala:510)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [149]:
import numpy as np

# ... (cosine_similarity function definition)

def recommend_papers(title, top_n=5):
  # Assuming df_clustered is a pandas DataFrame
  paper_cluster = df_clustered[df_clustered["title"] == title]["prediction"].iloc[0]
  cluster_papers = df_clustered[df_clustered["prediction"] == paper_cluster]
  paper_features = df_clustered[df_clustered["title"] == title]["pca_features"].iloc[0]

  similarities = []
  for _, row in cluster_papers.iterrows():
    other_title = row["title"]
    other_features = row["pca_features"]
    similarity = cosine_similarity(paper_features, other_features)
    similarities.append((other_title, similarity))

  similarities.sort(key=lambda x: x[1], reverse=True)
  return similarities[:top_n]
