In [None]:
import re

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, PCA, Word2Vec
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, regexp_replace, lower, size
from pyspark.sql.types import ArrayType, FloatType, StringType

# Create (or reuse) the Spark session that drives the whole pipeline
spark = (
    SparkSession.builder
    .appName("BD_Analytics_Clustering_Project")
    .getOrCreate()
)

# Step 1: Load the citation dataset from its JSON dump
file_path = "/path/to/DBLP_Citation_network_V10.json"
df = spark.read.json(path=file_path)

# Print the inferred schema and preview the first five rows
df.printSchema()
df.show(n=5)

# Step 2: Exploratory Data Analysis -- summary statistics of the text columns
text_columns = df.select("title", "abstract")
text_columns.describe().show()

# Step 3: Keep only English documents
# The filter is applied only when the dataset actually exposes a `language`
# column; accessing a missing column would abort the run with an
# AttributeError. Without that column, additional language detection logic
# is needed to restrict the corpus to English.
if "language" in df.columns:
    df = df.filter(col("language") == "English")
else:
    print("No 'language' column in the dataset; skipping the English-only filter.")

# Step 4: Preprocessing
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al',
                     'author', 'figure','rights', 'reserved', 'permission', 'used', 'using',
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www']

# Lower-cased lookup set built once from the list above. Matching is done
# case-insensitively because Spark's Tokenizer lower-cases its output, so
# mixed-case entries such as 'Elsevier' or 'PMC' would otherwise never match.
_custom_stop_word_set = frozenset(word.lower() for word in custom_stop_words)


# User-defined function to remove custom stop words
def remove_custom_stop_words(tokens):
    """Return ``tokens`` without any entry listed in ``custom_stop_words``.

    Args:
        tokens: list of string tokens, or ``None`` (a null array column value).

    Returns:
        A new list holding the tokens that are not custom stop words, in
        their original order. Comparison ignores case. ``None`` input
        yields an empty list.
    """
    if tokens is None:
        return []
    return [token for token in tokens if token.lower() not in _custom_stop_word_set]

# Expose the custom stop-word filter as a Spark UDF returning array<string>
remove_custom_stop_words_udf = udf(remove_custom_stop_words, ArrayType(StringType()))

# Tokenizer raises on null input, so rows without an abstract are dropped first
df = df.na.drop(subset=["abstract"])

# Tokenization
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
df = tokenizer.transform(df)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

# Remove custom stop words
df = df.withColumn("filtered_words", remove_custom_stop_words_udf(col("filtered_words")))

# Remove punctuation and convert to lower case
# Raw string with the hyphen and brackets escaped: an unescaped ')-[' inside
# the character class is a range that also swallows digits and A-Z.
regex_pattern = re.compile(r'[!()\-\[\]{};:"\\,<>./?@#$%^&*_~]')


def clean_tokens(tokens):
    """Strip punctuation from every token and lower-case it.

    Args:
        tokens: list of string tokens, or ``None`` (a null array value).

    Returns:
        A list of cleaned tokens in their original order. Tokens that are
        empty after punctuation removal are dropped. ``None`` input yields
        an empty list.
    """
    if tokens is None:
        return []
    cleaned = (regex_pattern.sub('', token).lower() for token in tokens)
    return [token for token in cleaned if token]


# "filtered_words" is an array column, so the cleanup has to run per token;
# regexp_replace()/lower() only accept string columns (and regexp_replace
# takes the pattern as a string, not a compiled re.Pattern).
# NOTE(review): tokens that still carry punctuation when the stop-word
# removers run above (e.g. "the,") are not matched by them -- confirm whether
# that is acceptable for this corpus.
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))
df = df.withColumn("cleaned_words", clean_tokens_udf(col("filtered_words")))

# Step 5: Vectorization
# TF-IDF: raw term counts first, then inverse-document-frequency weighting
vectorizer = CountVectorizer().setInputCol("cleaned_words").setOutputCol("raw_features")
vectorizer_model = vectorizer.fit(df)
df = vectorizer_model.transform(df)

idf = IDF().setInputCol("raw_features").setOutputCol("features")
idf_model = idf.fit(df)
df = idf_model.transform(df)

# Word2Vec (optional, uncomment if needed)
# word2Vec = Word2Vec(inputCol="cleaned_words", outputCol="word2vec_features")
# model = word2Vec.fit(df)
# df = model.transform(df)

# Step 6: Clustering
# PCA for dimensionality reduction
# Number of principal components, shared by the PCA stage and the sanity
# filter below so the two cannot drift apart.
n_components = 20
pca = PCA(k=n_components, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(df)
df = pca_model.transform(df)

# Extract values array from pca_features and filter rows
# A vector column has no extractable `.values` field; vector_to_array()
# converts it into a plain array<double> column that size() can inspect.
df = df.withColumn("values", vector_to_array(col("pca_features")))
df = df.filter(size(col("values")) == n_components)

# K-means clustering
kmeans = KMeans(k=5, seed=1, featuresCol="pca_features")
model = kmeans.fit(df)
df = model.transform(df)

# Step 7: Search engine
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Each Euclidean norm is derived from the vector's dot product with
    itself, so the arguments only need a ``dot`` method that accepts the
    other vector (``pyspark.ml.linalg`` dense and sparse vectors provide
    one).

    Args:
        vec1: first vector.
        vec2: second vector, same dimension as ``vec1``.

    Returns:
        ``vec1 . vec2 / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero length (the ratio is undefined there and would otherwise
        divide by zero).
    """
    norm1 = float(vec1.dot(vec1)) ** 0.5
    norm2 = float(vec2.dot(vec2)) ** 0.5
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return float(vec1.dot(vec2)) / (norm1 * norm2)

def recommend_papers(title, N):
    """Return the titles of the ``N`` papers most similar to a given paper.

    Similarity is the cosine similarity between the "features" vector of
    the paper named ``title`` and the "features" vector of every other row
    of the module-level ``df``.

    Args:
        title: exact title of the paper to find recommendations for.
        N: maximum number of recommendations to return.

    Returns:
        A DataFrame with a single "title" column holding at most ``N``
        rows, ordered by descending similarity. Rows whose title equals
        ``title`` are excluded so the paper is not recommended to itself.

    Raises:
        IndexError: if no row of ``df`` has the given title.
    """
    # Find the paper; first() fetches a single row instead of collecting
    # every match to the driver.
    target_row = df.filter(col("title") == title).select("features").first()
    if target_row is None:
        raise IndexError(f"No paper titled {title!r} found in the dataset")
    target_paper = target_row.features

    # Calculate similarity of every other paper against the target
    similarity_udf = udf(lambda x: cosine_similarity(x, target_paper), FloatType())
    candidates = df.filter(col("title") != title)
    df_with_similarity = candidates.withColumn("similarity", similarity_udf(col("features")))

    # Get top N similar papers
    recommendations = df_with_similarity.orderBy(col("similarity").desc()).select("title").limit(N)
    return recommendations

# Example usage: display the five papers most similar to the given title
recommendations = recommend_papers(title="Example Title", N=5)
recommendations.show()
