In [1]:
#Importing all
!pip install langid
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, PCA
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, lower, regexp_replace, split, udf
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.linalg import Vectors, SparseVector
import numpy as np
import langid
import matplotlib.pyplot as plt
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, lower, regexp_replace, split
from pyspark.ml.feature import StopWordsRemover


Collecting langid
  Using cached langid-1.1.6-py3-none-any.whl
Installing collected packages: langid
Successfully installed langid-1.1.6


In [2]:
# spark = SparkSession.builder \
#     .appName("Project 3") \
#     .getOrCreate()

In [3]:
# Create a SparkSession with the required configuration
spark = SparkSession.builder \
    .config("spark.sql.shuffle.partitions" ,10) \
    .config("spark.executor.memory" , "8g") \
    .master("local[*]") \
    .appName("Project 3") \
    .getOrCreate()

In [4]:
!unzip -u dblp.v10.zip

Archive:  dblp.v10.zip


In [5]:
df = spark.read.json("./dblp-ref/*.json", multiLine=True)

In [6]:
# Show the schema
df.printSchema()
df.show(5)
print(f"Number of records: {df.count()}")

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|
|In this paper, a ...

In [7]:
# Display summary statistics
df.describe().show()

# Check for missing values (excluding `isnan`)
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Distribution of citations
df.select("n_citation").describe().show()

# Check for null abstracts and titles
df.filter(df.abstract.isNull() | df.title.isNull()).show()

+-------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|summary|            abstract|                  id|       n_citation|               title|               venue|              year|
+-------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|  count|                   4|                   4|                4|                   4|                   4|                 4|
|   mean|                NULL|                NULL|             14.5|                NULL|                NULL|           2011.25|
| stddev|                NULL|                NULL|23.96525262402492|                NULL|                NULL|3.9475730941089733|
|    min|AdaBoost algorith...|00127ee2-cb05-48c...|                0|A Heterogeneous S...|Mathematics and C...|              2008|
|    max|The purpose of th...|4ab3735c-80f1-472...|               50|Preliminary De

In [8]:
# Function to detect language using langid
def detect_language(text):
    if text is None:
        return None
    lang, _ = langid.classify(text)
    return lang

# Registering UDF
lang_detect_udf = udf(detect_language, StringType())

# Add a new column for language detection
df = df.withColumn("language", lang_detect_udf(df.abstract))

# Filter only English documents
df = df.filter(df.language == 'en')

In [9]:
# Custom stop words
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 
                     'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 
                     'CZI', 'www']

In [10]:
# Lowercase and remove punctuation
df_cleaned = df.withColumn("cleaned_abstract", lower(col("abstract")))
df_cleaned = df_cleaned.withColumn("cleaned_abstract", regexp_replace(col("cleaned_abstract"), r'[!()\-\[\]{};:\'",<>./?@#$%^&*_~]', ''))

In [11]:
# Tokenize the text
df_tokenized = df_cleaned.withColumn("tokenized_abstract", split(col("cleaned_abstract"), " "))

In [12]:
# Remove stop words
remover = StopWordsRemover(inputCol="tokenized_abstract", outputCol="filtered_abstract", 
                           stopWords=StopWordsRemover().getStopWords() + custom_stop_words)
df_filtered = remover.transform(df_tokenized)

In [13]:
# Show the cleaned dataframe
df_filtered.select("abstract", "cleaned_abstract", "tokenized_abstract", "filtered_abstract").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Apply TF
hashingTF = HashingTF(inputCol="filtered_abstract", outputCol="rawFeatures", numFeatures=20000)
df_featurized = hashingTF.transform(df_filtered)
# df_featurized.show()

In [15]:
# Apply IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(df_featurized)
df_vectorized = idf_model.transform(df_featurized)
df_vectorized.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|language|    cleaned_abstract|  tokenized_abstract|   filtered_abstract|         rawFeatures|            features|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|      en|based on biologic...|[based, on, biolo...|[based, biologica...|(20000,[28,

In [16]:
# Select only the columns we need
df_vectorized = df_vectorized.select("id", "title", "features")
df_vectorized.show(5)

+--------------------+--------------------+--------------------+
|                  id|               title|            features|
+--------------------+--------------------+--------------------+
|4aa69add-3978-480...|Dynamic analysis ...|(20000,[28,42,274...|
|4ab3735c-80f1-472...|A new approach of...|(20000,[78,274,46...|
|00127ee2-cb05-48c...|Preliminary Desig...|(20000,[1072,1241...|
|001eef4f-1d00-4ae...|A Heterogeneous S...|(20000,[193,274,2...|
+--------------------+--------------------+--------------------+



In [17]:
#Clustering 

def sample_data(df, fraction, max_attempts=5):
    attempt = 0
    sampled_df = df.sample(fraction=fraction, seed=42)
    while sampled_df.count() == 0 and attempt < max_attempts:
        fraction *= 2  # Increase fraction to get more data
        sampled_df = df.sample(fraction=fraction, seed=42)
        attempt += 1
    if sampled_df.count() == 0:
        raise ValueError("Sampled DataFrame is empty after several attempts.")
    return sampled_df

# Sample the data with initial fraction 0.1
df_sampled = sample_data(df_vectorized, 0.1)
print(df_sampled)
print(f"Sampled DataFrame count: {df_sampled.count()}")
df_sampled.show(5)


DataFrame[id: string, title: string, features: vector]
Sampled DataFrame count: 3
+--------------------+--------------------+--------------------+
|                  id|               title|            features|
+--------------------+--------------------+--------------------+
|4aa69add-3978-480...|Dynamic analysis ...|(20000,[28,42,274...|
|00127ee2-cb05-48c...|Preliminary Desig...|(20000,[1072,1241...|
|001eef4f-1d00-4ae...|A Heterogeneous S...|(20000,[193,274,2...|
+--------------------+--------------------+--------------------+



In [18]:
from pyspark.sql import DataFrame

# Check if df_sampled is a Spark DataFrame
if isinstance(df_sampled, DataFrame):
    print("df_sampled is a Spark DataFrame")
else:
    print("df_sampled is not a Spark DataFrame")

# Check if df_sampled has a column named "features"
if "features" in df_sampled.columns:
    print("df_sampled has a column named 'features'")
else:
    print("df_sampled does not have a column named 'features'")

df_sampled is a Spark DataFrame
df_sampled has a column named 'features'


In [19]:
from pyspark.ml.feature import PCA


# Create a PCA object with k=20, inputCol="features", outputCol="pca_features"
pca = PCA(k=20, inputCol="features", outputCol="pca_features")

# Repartition the data to increase parallelism
df_sampled_repartitioned = df_sampled.repartition(500)

# Fit the PCA model
pca_model = pca.fit(df_sampled_repartitioned)

# Transform the data using the PCA model
df_pca = pca_model.transform(df_sampled_repartitioned)

IllegalArgumentException: requirement failed: Cannot aggregate object of size 1600080000 Bytes, as it's bigger than maxResultSize (1073741824 Bytes)

In [None]:
# # Apply PCA to reduce dimensions (e.g., to 20 components)
# pca = PCA(k=20, inputCol="features", outputCol="pca_features")
# pca_model = pca.fit(df_sampled)
# df_pca = pca_model.transform(df_sampled)
# print(pca)

In [None]:
# Elbow Method to Find Optimal K
costs = []
for k in range(2, 21):
    kmeans = KMeans(k=k, seed=1, featuresCol="pca_features")
    model = kmeans.fit(df_pca)
    costs.append(model.summary.trainingCost)

plt.plot(range(2, 21), costs, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Cost')
plt.title('Elbow Method for Optimal k')
plt.show()


In [None]:
# KMeans Clustering (using the optimal k from the elbow method)
optimal_k = 10  # Example optimal k value
kmeans = KMeans(k=optimal_k, seed=1, featuresCol="pca_features")
model = kmeans.fit(df_pca)
df_clustered = model.transform(df_pca)

In [None]:
# Search Engine Function
def recommend_papers(title, top_n=5):
    paper_cluster = df_clustered.filter(col("title") == title).select("prediction").first()[0]
    cluster_papers = df_clustered.filter(col("prediction") == paper_cluster)
    paper_features = df_clustered.filter(col("title") == title).select("features").first()[0]

    cluster_papers = cluster_papers.withColumn("similarity", cosine_similarity_udf(cluster_papers.features, array([paper_features])))
    recommendations = cluster_papers.orderBy("similarity", ascending=False).limit(top_n)

    return recommendations.select("title", "similarity").collect()

# Example Usage
query_title = "A new approach of...."
recommended_papers = recommend_papers(query_title, top_n=5)
print(f"\nRecommendations for '{query_title}':\n")
for rec in recommended_papers:
    print(f"Title: {rec['title']}, Similarity: {rec['similarity']:.4f}")