In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start (or reuse) the Spark session that backs the whole notebook
spark = (
    SparkSession.builder
    .appName("BD_Analytics_Clustering_Project")
    .getOrCreate()
)

# Read every JSON file under ./dblp-ref in multi-line mode
# NOTE(review): multi-line mode parses each file as one JSON document; confirm
# that this matches how the dblp-ref files are laid out on disk.
df = spark.read.option("multiLine", True).json("./dblp-ref/*.json")

# Column names, types and nullability as inferred by Spark
df.printSchema()

# Peek at the first five rows
df.show(5)

# Total number of rows that were loaded
record_count = df.count()
print(f"Number of records: {record_count}")

# count / mean / stddev / min / max summary
df.describe().show()

# Per-column null counts: count() only counts rows where when() produced a
# non-null value, i.e. rows in which the column itself is null
null_counts = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

# Summary statistics for the citation counts alone
df.select("n_citation").describe().show()

# Rows that are missing the abstract, the title, or both
missing_text = F.col("abstract").isNull() | F.col("title").isNull()
df.where(missing_text).show()


root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|Based on biologic...|[Guoping Pang, La...|4aa69add-3978-480...|         8|[04754a28-6bf4-4d...|Dynamic analysis ...|Mathematics and C...|2008|
|In this paper, a ...

In [7]:
!pip install langid

Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m885.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25ldone
[?25h  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941172 sha256=98be2d27530f3617580d67022e97a0c311c9ea18d14e781467fe0672f6fe89bf
  Stored in directory: /home/jovyan/.cache/pip/wheels/32/6a/b6/b7eb43a6ad55b139c15c5daa29f3707659cfa6944d3c696f5b
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [6]:
import langid
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Function to detect language using langid
def detect_language(text):
    """Return the language code langid assigns to ``text``.

    Args:
        text: The text to classify (a string), or None.

    Returns:
        The language code returned by ``langid.classify`` (for example
        ``'en'``), or None when ``text`` is None, empty, or whitespace only.
    """
    # langid always returns a best-guess language, even when there is nothing
    # to classify, so blank text is treated as missing rather than being
    # tagged with an arbitrary language.
    if text is None or not text.strip():
        return None
    lang, _ = langid.classify(text)
    return lang

# Wrap the detector as a Spark UDF that produces a string column
lang_detect_udf = udf(detect_language, StringType())

# Tag every record with the language detected from its abstract
df = df.withColumn("language", lang_detect_udf(df["abstract"]))

# Keep only the records tagged as English ('en')
df = df.where(df["language"] == 'en')


ModuleNotFoundError: No module named 'cld3'

In [None]:
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, lower, regexp_replace

# Custom stop words (matched case-insensitively by StopWordsRemover's default)
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 
                     'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 
                     'CZI', 'www']

# Lowercase and remove punctuation
df_cleaned = df.withColumn("cleaned_abstract", lower(col("abstract")))
# '-' and '[' are escaped so they are literal members of the character class.
# Unescaped, ")-[" is not a list of three punctuation marks: it either forms
# a range (which also covers the digits) or, in the Java regex dialect Spark
# uses, opens a nested character class.
df_cleaned = df_cleaned.withColumn(
    "cleaned_abstract",
    regexp_replace(col("cleaned_abstract"), r'[!()\-\[\]{};:\'",<>./?@#$%^&*_~]', ''),
)

# Remove stop words
# StopWordsRemover only accepts an array-of-strings column, so the cleaned
# text is split on whitespace first (F is pyspark.sql.functions, imported at
# the top of the notebook).
# Assumes abstracts are non-null here (nulls are dropped by the language
# filter) -- TODO confirm if that filter is ever removed.
df_cleaned = df_cleaned.withColumn(
    "abstract_tokens", F.split(F.trim(col("cleaned_abstract")), r"\s+")
)
remover = StopWordsRemover(inputCol="abstract_tokens", outputCol="filtered_tokens",
                           stopWords=StopWordsRemover().getStopWords() + custom_stop_words)
df_cleaned = remover.transform(df_cleaned)

# Join the surviving tokens back into one string so that "filtered_abstract"
# is a text column, which is what a Tokenizer stage expects as input.
df_cleaned = (
    df_cleaned
    .withColumn("filtered_abstract", F.concat_ws(" ", col("filtered_tokens")))
    .drop("abstract_tokens", "filtered_tokens")
)


In [None]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

# Split each filtered abstract into a list of word tokens
tokenizer = Tokenizer(outputCol="words", inputCol="filtered_abstract")
df_tokenized = tokenizer.transform(df_cleaned)

# Term frequencies: hash the tokens into a fixed-size vector of 20000 slots
hashingTF = HashingTF(numFeatures=20000, inputCol="words", outputCol="rawFeatures")
df_featurized = hashingTF.transform(df_tokenized)

# Inverse document frequency: fit the weights on the corpus ...
idf = IDF(outputCol="features", inputCol="rawFeatures")
idf_model = idf.fit(df_featurized)

# ... then rescale the raw counts, keeping only the identifying columns and
# the resulting TF-IDF vector
df_vectorized = idf_model.transform(df_featurized).select("id", "title", "features")
df_vectorized.show(5)


In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import PCA

# Apply PCA to project the TF-IDF vectors onto their first 100 principal components.
# NOTE: k=100 is a fixed component count, not a variance target. The share of
# variance actually retained can be read from pca_model.explainedVariance.
pca = PCA(k=100, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(df_vectorized)
# Cache the reduced data: it is re-read by every K-means fit in this notebook,
# and without caching the whole upstream pipeline can be recomputed each time.
df_pca = pca_model.transform(df_vectorized).cache()

# Determine the optimal number of clusters using the elbow method:
# fit one model per candidate k and record its training cost.
k_values = list(range(2, 21))
costs = []
for k in k_values:
    kmeans = KMeans(k=k, seed=1, featuresCol="pca_features")
    model = kmeans.fit(df_pca)
    costs.append(model.summary.trainingCost)

# Plot the elbow curve
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(k_values, costs, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()


In [None]:
# Cluster count taken as 10 (assumed to be the optimum)
optimal_k = 10

# Fit K-means on the PCA-reduced vectors with the chosen number of clusters
kmeans = KMeans(featuresCol="pca_features", k=optimal_k, seed=1)
model = kmeans.fit(df_pca)

# Attach a cluster label (column "prediction") to every paper, then preview
# the assignment for a few rows
df_clusters = model.transform(df_pca)
cluster_preview = df_clusters.select("id", "title", "prediction")
cluster_preview.show(5)


In [None]:
from pyspark.ml.linalg import Vectors
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a built-in float.

    Args:
        v1, v2: Equal-length vectors. Each may be a sequence of numbers, a
            NumPy array, or any object exposing ``toArray()`` (as pyspark.ml
            vectors do).

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm (the similarity is undefined
        there, and a NaN would propagate into sorting and comparisons).
    """
    # Objects with toArray() are converted first so np.dot sees plain arrays.
    a = np.asarray(v1.toArray() if hasattr(v1, "toArray") else v1, dtype=float)
    b = np.asarray(v2.toArray() if hasattr(v2, "toArray") else v2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0

    # Convert to a built-in float: a NumPy scalar returned from a Spark UDF
    # declared as DoubleType cannot be serialised reliably.
    return float(np.dot(a, b) / denominator)

# Column-vs-column UDF; both arguments must be DataFrame columns
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Build search engine function
def recommend_papers(title, top_n=5):
    """Recommend the papers most similar to the paper with the given title.

    Candidates are the other papers in the same K-means cluster
    (``prediction`` column of ``df_clusters``). They are ranked by cosine
    similarity between their ``features`` vector and the query paper's.

    Args:
        title: Exact title of the query paper, as stored in ``df_clusters``.
            If several papers share the title, the first one returned by
            Spark is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` Rows with fields ``title`` and
        ``similarity``, most similar first. The query paper is not included.

    Raises:
        IndexError: If no paper in ``df_clusters`` has this title.
    """
    # One lookup for everything needed about the query paper
    matches = (
        df_clusters
        .filter(df_clusters["title"] == title)
        .select("id", "prediction", "features")
        .take(1)
    )
    if not matches:
        raise IndexError(f"No paper with title {title!r} found in df_clusters")
    query_id, paper_cluster, paper_features = matches[0]
    query_array = paper_features.toArray()

    # UDF arguments must be columns, so the query vector is captured in a
    # closure rather than passed as a second UDF argument.
    def _similarity_to_query(features):
        score = float(cosine_similarity(features.toArray(), query_array))
        # NaN (zero-norm vector) sorts above every number in descending
        # order, so it is mapped to 0.0.
        return 0.0 if np.isnan(score) else score

    similarity_to_query_udf = udf(_similarity_to_query, DoubleType())

    # Same cluster, excluding the query paper itself (null-safe on id)
    cluster_papers = df_clusters.filter(
        (df_clusters["prediction"] == paper_cluster)
        & ~df_clusters["id"].eqNullSafe(query_id)
    )

    # Calculate cosine similarity and recommend top N papers
    cluster_papers = cluster_papers.withColumn(
        "similarity", similarity_to_query_udf(cluster_papers["features"])
    )
    recommendations = cluster_papers.orderBy("similarity", ascending=False).limit(top_n)

    return recommendations.select("title", "similarity").collect()

# Example: fetch the five closest papers for one title and print each
# recommendation with its similarity score
recommended_papers = recommend_papers("A new approach of....", top_n=5)
for paper in recommended_papers:
    paper_title, score = paper['title'], paper['similarity']
    print(f"Title: {paper_title}, Similarity: {score}")
