In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, PCA
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, lower, regexp_replace, split, udf
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.linalg import Vectors, SparseVector
import numpy as np
import langid
import matplotlib.pyplot as plt

# Helper Functions
def detect_language(text):
    """Detects the language of a text using langid."""
    if text is None:
        return None
    lang, _ = langid.classify(text)
    return lang

def cosine_similarity(v1, v2):
    """Calculates cosine similarity between two vectors, handling SparseVectors."""
    v1 = v1.toArray() if isinstance(v1, SparseVector) else v1
    v2 = v2.toArray() if isinstance(v2, SparseVector) else v2
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return float(dot_product / (norm_v1 * norm_v2)) if norm_v1 != 0 and norm_v2 != 0 else 0.0

spark = SparkSession.builder \
    .appName("BD_Analytics_Clustering_Project") \
    .config("spark.driver.maxResultSize", "1000g") \
    .getOrCreate()

# Load Data
df = spark.read.json("./dblp-ref/*.json", multiLine=True).select("id", "title", "abstract")

# Data Exploration (as before - include any relevant EDA steps here)
# ...

# Preprocessing
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 
                     'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 
                     'CZI', 'www']

lang_detect_udf = udf(detect_language, StringType())
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

df_preprocessed = (df
    .filter(lang_detect_udf(df.abstract) == 'en')  # Keep English documents
    .withColumn("cleaned_abstract", lower(regexp_replace(col("abstract"), r'[^\w\s]', '')))  # Remove punctuation, lowercase
    .withColumn("tokenized_abstract", split(col("cleaned_abstract"), " "))  # Tokenize
)

remover = StopWordsRemover(inputCol="tokenized_abstract", outputCol="filtered_abstract", stopWords=StopWordsRemover().getStopWords() + custom_stop_words)
df_filtered = remover.transform(df_preprocessed)  # Remove stop words

# TF-IDF Vectorization
hashingTF = HashingTF(inputCol="filtered_abstract", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
df_tfidf = idf.fit(hashingTF.transform(df_filtered)).transform(hashingTF.transform(df_filtered))

# Dimensionality Reduction with PCA
pca = PCA(k=20, inputCol="features", outputCol="pca_features")
df_pca = pca.fit(df_tfidf.coalesce(1000)).transform(df_tfidf)

# KMeans Clustering (using the optimal k from the elbow method)
optimal_k = 10  # Determined based on the elbow method visualization
kmeans = KMeans(k=optimal_k, seed=1, featuresCol="pca_features")
model = kmeans.fit(df_pca)
df_clustered = model.transform(df_pca)

# Cache and drop interim dataframes
df_tfidf = df_tfidf.drop()
pca = pca.cache()

# Elbow Method to Find Optimal K
costs = []
for k in range(2, 21): 
    kmeans = KMeans(k=k, seed=1, featuresCol="pca_features")
    model = kmeans.fit(df_pca)
    costs.append(model.summary.trainingCost)
plt.plot(range(2, 21), costs, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Cost')
plt.title('Elbow Method for Optimal k')
plt.show()

# ... (Rest of the code for KMeans clustering and search engine as before)


IllegalArgumentException: requirement failed: Cannot aggregate object of size 1600080000 Bytes, as it's bigger than maxResultSize (1073741824 Bytes)

In [None]:
# ... (Previous code for data loading, preprocessing, TF-IDF, and PCA as shown before)

# KMeans Clustering (using the optimal k from the elbow method)
optimal_k = 10 # Determined based on the elbow method visualization

kmeans = KMeans(k=optimal_k, seed=1, featuresCol="pca_features")
model = kmeans.fit(df_pca)
df_clustered = model.transform(df_pca)


# Search Engine Function
def recommend_papers(title, top_n=5):
    """Recommends papers based on title similarity within the same cluster."""
    try:
        # Find the cluster of the given paper title
        paper_cluster = df_clustered.filter(df_clustered.title == title).select("prediction").first()[0]
    except TypeError: # Handle the case where the title is not found
        return []  
    
    # Get all papers in the same cluster
    cluster_papers = df_clustered.filter(df_clustered.prediction == paper_cluster)
    
    # Get the features of the given paper
    paper_features = df_clustered.filter(df_clustered.title == title).select("features").first()[0]
    
    # Calculate cosine similarity and recommend top N papers
    cluster_papers = cluster_papers.withColumn("similarity", cosine_similarity_udf(cluster_papers.features, F.array([paper_features])))
    recommendations = cluster_papers.orderBy("similarity", ascending=False).limit(top_n)
    
    return recommendations.select("title", "similarity").collect()

# Example Usage (replace with a title in your dataset)
query_title = "A new approach of...." 
recommended_papers = recommend_papers(query_title, top_n=5)
print(f"\nRecommendations for '{query_title}':\n")
for rec in recommended_papers:
    print(f"Title: {rec['title']}, Similarity: {rec['similarity']:.4f}")


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lower, regexp_replace, split, array
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, SparseVector
import numpy as np
import langid
import matplotlib.pyplot as plt

# Helper Functions
def detect_language(text):
    if text is None:
        return None
    lang, _ = langid.classify(text)
    return lang

def cosine_similarity(v1, v2):
    v1_arr = v1.toArray() if isinstance(v1, SparseVector) else v1
    v2_arr = v2.toArray() if isinstance(v2, SparseVector) else v2
    dot_product = np.dot(v1_arr, v2_arr)
    norm_v1 = np.linalg.norm(v1_arr)
    norm_v2 = np.linalg.norm(v2_arr)
    return float(dot_product / (norm_v1 * norm_v2)) if norm_v1 != 0 and norm_v2 != 0 else 0.0

spark = SparkSession.builder \
    .appName("BD_Analytics_Clustering_Project") \
    .config("spark.sql.session.timeZone", "UTC") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# Load Data
df = spark.read.json("./dblp-ref/*.json", multiLine=True).select("id", "title", "abstract")

# Preprocessing
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al',
                     'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using',
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC',
                     'CZI', 'www']

lang_detect_udf = udf(detect_language, StringType())
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

df_preprocessed = (
    df.filter(lang_detect_udf(df.abstract) == 'en')
      .withColumn("cleaned_abstract", lower(regexp_replace(col("abstract"), r'[^\w\s]', '')))
      .withColumn("tokenized_abstract", split(col("cleaned_abstract"), " "))
)

remover = StopWordsRemover(inputCol="tokenized_abstract", outputCol="filtered_abstract",
                           stopWords=StopWordsRemover().getStopWords() + custom_stop_words)
df_filtered = remover.transform(df_preprocessed)

# TF-IDF Vectorization
hashingTF = HashingTF(inputCol="filtered_abstract", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
df_tfidf = idf.fit(hashingTF.transform(df_filtered)).transform(hashingTF.transform(df_filtered))

# Dimensionality Reduction with PCA
pca = PCA(k=20, inputCol="features", outputCol="pca_features")
df_pca = pca.fit(df_tfidf).transform(df_tfidf)

# Elbow Method to Find Optimal K
costs = []
for k in range(2, 21):
    kmeans = KMeans(k=k, seed=1, featuresCol="pca_features")
    model = kmeans.fit(df_pca)
    costs.append(model.summary.trainingCost)

plt.plot(range(2, 21), costs, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Cost')
plt.title('Elbow Method for Optimal k')
plt.show()

# KMeans Clustering (using the optimal k from the elbow method)
optimal_k = 10  # Example optimal k value
kmeans = KMeans(k=optimal_k, seed=1, featuresCol="pca_features")
model = kmeans.fit(df_pca)
df_clustered = model.transform(df_pca)

# Search Engine Function
def recommend_papers(title, top_n=5):
    paper_cluster = df_clustered.filter(col("title") == title).select("prediction").first()[0]
    cluster_papers = df_clustered.filter(col("prediction") == paper_cluster)
    paper_features = df_clustered.filter(col("title") == title).select("features").first()[0]

    cluster_papers = cluster_papers.withColumn("similarity", cosine_similarity_udf(cluster_papers.features, array([paper_features])))
    recommendations = cluster_papers.orderBy("similarity", ascending=False).limit(top_n)

    return recommendations.select("title", "similarity").collect()

# Example Usage
query_title = "A new approach of...."
recommended_papers = recommend_papers(query_title, top_n=5)
print(f"\nRecommendations for '{query_title}':\n")
for rec in recommended_papers:
    print(f"Title: {rec['title']}, Similarity: {rec['similarity']:.4f}")

IllegalArgumentException: requirement failed: Cannot aggregate object of size 1600080000 Bytes, as it's bigger than maxResultSize (1073741824 Bytes)