In [None]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import Tokenizer as NLPTokenizer
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Tokenizer, HashingTF, IDF, PolynomialExpansion, StopWordsRemover
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark import SparkContext
from pyspark.sql import functions as F , SparkSession
# Assemble the pipeline
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType
from pyspark.sql.functions import col, when, regexp_extract

# Spark session settings, applied to the builder in the order listed here.
spark_settings = {
    "spark.driver.memory": "12g",
    "spark.executor.memory": "12g",
    "spark.executor.memoryOverhead": "1g",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.default.parallelism": "4",
    "spark.driver.maxResultSize": "2g",
    "spark.executor.cores": "2",
    "spark.executor.instances": "2",
    "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0",
    "spark.sql.optimizer.dynamicPartitionPruning.enabled": "true",
}

# Build (or reuse) a local session that uses every available core.
session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Reader options for the input file; every column is read as a string
# because inferSchema is 'false'.
csv_options = dict(
    header=True,
    delimiter=",",
    quote='"',
    escape='"',
    inferSchema='false',
    encoding='UTF8',
    multiline=True,
    rootTag='',
    rowTag='',
    attributePrefix='',
)
df = spark.read.format('CSV').options(**csv_options).load("cleaned2.csv")

# Impact is the only column converted to a numeric type here.
df = df.withColumn("Impact", col("Impact").cast(FloatType()))
# df.write.parquet("cleaned1.parquet")
# df = spark.read.parquet('cleaned1.parquet')

# A run of four digits is treated as the publication year.
year_pattern = r"(\d{4})"
raw_published_date = col("publishedDate")
first_year_found = regexp_extract(raw_published_date, year_pattern, 1)

# Keep only the extracted year; rows without a four-digit run become NULL
# because the when() has no otherwise() branch.
df = df.withColumn(
    "publishedDate",
    when(raw_published_date.rlike(year_pattern), first_year_found),
)

# Remove the column named '_c0', then show the resulting schema.
df = df.drop('_c0')
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- Impact: float (nullable = true)



In [None]:
# Reflect Impact around (max + 1) and take the natural log: the largest
# Impact maps to log(1) = 0 and smaller values map to larger results.
max_value = df.agg(F.max("Impact")).first()[0] + 1
reflected_impact = max_value - F.col("Impact")
df = df.withColumn("Impact", F.log(reflected_impact))

In [None]:
def _missing_count(column_name):
    """Build an aggregate counting rows where the column is NULL, "" or "NaN"."""
    column = F.col(column_name)
    looks_missing = (
        column.isNull()
        | (column == "")
        | (column.cast("string") == "NaN")
    )
    # when() yields a non-null value only for missing rows, so count()
    # returns the number of such rows.
    return F.count(F.when(looks_missing, column_name)).alias(column_name)


# One missing-value count per column, computed in a single pass over df.
null_counts = df.select([_missing_count(c) for c in df.columns])

# Show null counts for verification
null_counts.show()

+-----+-----------+-------+---------+-------------+----------+------+
|Title|description|authors|publisher|publishedDate|categories|Impact|
+-----+-----------+-------+---------+-------------+----------+------+
|    0|      10952|    456|        0|           35|         0|     0|
+-----+-----------+-------+---------+-------------+----------+------+



# Fill Null Values

In [None]:
# Placeholder text substituted for NULLs in the free-text columns.
text_placeholders = {
    "Title": "No Title",
    "description": "No Description",
    "authors": "Unknown author",
}
df = df.na.fill(text_placeholders)

In [None]:
# Single reference to the publishedDate column, reused by every rule below.
published_date_col = F.col("publishedDate")

# (regex, label) pairs, tested in order; the first matching rule wins.
date_format_rules = [
    (r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$", "T-Timestamp"),
    (r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$", "Z-Timestamp"),
    (r"^\d{4}-\d{2}-\d{2}$", "yyyy-dd-mm"),
    (r"^\d{4}-\d{2}$", "yyyy-mm"),
    (r"^\d{4}$", "yyyy"),
    (r"^\d{4}\*$", "yyyy*"),
]

# Fold the rules into one when(...).when(...) chain.
(first_regex, first_label), *remaining_rules = date_format_rules
date_format_expr = F.when(published_date_col.rlike(first_regex), first_label)
for regex, label in remaining_rules:
    date_format_expr = date_format_expr.when(published_date_col.rlike(regex), label)

# Values matching no rule are labelled NULL.
df = df.withColumn("date_format", date_format_expr.otherwise(None))

# Count how many rows fall under each detected format.
df.groupBy("date_format").count().show()

+-----------+------+
|date_format| count|
+-----------+------+
|       NULL|    35|
|       yyyy|138689|
+-----------+------+



# Date Feature Engineering

In [None]:
# Derive the publication year from publishedDate as its own column.
published_year_expr = F.year(F.col("publishedDate"))
df = df.withColumn("publishedYear", published_year_expr)

In [None]:
# Approximate median (relative error 0.01) of the publication years.
median_year = df.approxQuantile("publishedYear", [0.5], 0.01)[0]
# Replace NULLs with the median year. publishedYear is an integer column, so
# the replacement is passed as an int instead of relying on a string being
# cast implicitly to the column type.
df = df.fillna({"publishedYear": int(median_year)})

In [None]:
# Reference year used to derive book_age.
current_year = 2024
published_year = col("publishedYear")

# Age, decade (e.g. 1990) and century number (e.g. 20 for 19xx) of publication.
df = (
    df.withColumn("book_age", lit(current_year) - published_year)
      .withColumn("published_decade", (published_year / 10).cast("int") * 10)
      .withColumn("published_century", (published_year / 100).cast("int") + 1)
)

# Era buckets, evaluated top to bottom; anything outside the three explicit
# ranges (including a NULL year) is labelled "Ancient".
is_modern = published_year >= 2000
is_contemporary = (published_year >= 1900) & (published_year < 2000)
is_classic = (published_year >= 1800) & (published_year < 1900)
era_label = (
    F.when(is_modern, "Modern")
    .when(is_contemporary, "Contemporary")
    .when(is_classic, "Classic")
    .otherwise("Ancient")
)
df = df.withColumn("published_era", era_label)

In [None]:
# Date helper columns to remove from df.
columns_to_drop = ["date_format", "publishedDate"]

# Remove them one at a time.
for obsolete_column in columns_to_drop:
    df = df.drop(obsolete_column)

# TF-IDF for Title and Description

In [None]:
# Kept unchanged so the names HashingTF / IDF resolve exactly as before in
# any later cell; this cell itself uses the DataFrame-based classes below.
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.ml.feature import (
    HashingTF as MLHashingTF,
    IDF as MLIDF,
    Tokenizer as MLTokenizer,
)

# Size of the hashed term-frequency vectors (same value as before).
TFIDF_NUM_FEATURES = 20


def add_tfidf_column(frame, input_col, output_col, num_features=TFIDF_NUM_FEATURES):
    """Append a TF-IDF vector column computed from a text column.

    The text is lower-cased and split on whitespace into words, hashed into
    ``num_features`` buckets and re-weighted by inverse document frequency.
    Everything runs as DataFrame transformations, so each vector stays on the
    row it was computed from and no id-based join is required.

    Args:
        frame: DataFrame containing ``input_col`` as a non-null string column.
        input_col: Name of the text column to vectorise.
        output_col: Name of the vector column to add.
        num_features: Number of hash buckets.

    Returns:
        ``frame`` with ``output_col`` appended; the intermediate token and
        term-frequency columns are dropped again.
    """
    tokens_col = f"{input_col}_tfidf_tokens"
    tf_col = f"{input_col}_tfidf_tf"
    stages = [
        # Hashing the raw string would count characters; tokenise into words.
        MLTokenizer(inputCol=input_col, outputCol=tokens_col),
        MLHashingTF(inputCol=tokens_col, outputCol=tf_col, numFeatures=num_features),
        MLIDF(inputCol=tf_col, outputCol=output_col),
    ]
    fitted = Pipeline(stages=stages).fit(frame)
    return fitted.transform(frame).drop(tokens_col, tf_col)


# Step 1: TF-IDF for the Title column.
df = add_tfidf_column(df, "Title", "title_tfidf")

# Step 2: TF-IDF for the description column.
df = add_tfidf_column(df, "description", "desc_tfidf")

In [None]:
# Print the current schema of df to check the columns present at this point.
df.printSchema()

root
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- authors: string (nullable = false)
 |-- publisher: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- Impact: double (nullable = true)
 |-- publishedYear: integer (nullable = true)
 |-- book_age: integer (nullable = true)
 |-- published_decade: integer (nullable = true)
 |-- published_century: integer (nullable = true)
 |-- published_era: string (nullable = false)
 |-- title_tfidf: vector (nullable = true)
 |-- desc_tfidf: vector (nullable = true)



# Encode Published Era

In [None]:
# Stage 1: map each era label to a numeric index.
indexer = StringIndexer(inputCol="published_era", outputCol="published_era_index")

# Stage 2: one-hot encode that index.
encoder = OneHotEncoder(inputCol="published_era_index", outputCol="published_era_encoded")

# Fit and apply both stages in order through a single pipeline.
era_pipeline = Pipeline(stages=[indexer, encoder])
df = era_pipeline.fit(df).transform(df)

# Feature Engineering for the authors and publisher Columns

In [None]:

# Frequency encoding for `authors` and `publisher`.
# Each frequency is counted over its own column: author_frequency is the
# number of rows with the same authors value, publisher_frequency the number
# of rows with the same publisher value. Counting per (authors, publisher)
# pair would give both columns the same pair count.
author_window = Window.partitionBy("authors")
publisher_window = Window.partitionBy("publisher")
df = (
    df.withColumn("author_frequency", F.count("authors").over(author_window))
      .withColumn("publisher_frequency", F.count("publisher").over(publisher_window))
)

# Document Assemblers for Authors and Publishers
document_assembler_authors = DocumentAssembler().setInputCol("authors").setOutputCol("authors_document")

document_assembler_publishers = DocumentAssembler() \
    .setInputCol("publisher") \
    .setOutputCol("publishers_document") \
    .setCleanupMode("shrink")

# BERT Embeddings for Authors and Publishers
# (one annotator instance per column, because each instance has its own
# input and output column names)
bert_embeddings_authors = BertSentenceEmbeddings.pretrained("sent_small_bert_L10_128", "en") \
    .setInputCols(["authors_document"]) \
    .setOutputCol("authors_embedding")

bert_embeddings_publishers = BertSentenceEmbeddings.pretrained("sent_small_bert_L10_128", "en") \
    .setInputCols(["publishers_document"]) \
    .setOutputCol("publishers_embedding")

# Define Pipeline for Document Assembly and BERT Embeddings
embedding_pipeline = Pipeline(stages=[
    document_assembler_authors,
    document_assembler_publishers,
    bert_embeddings_authors,
    bert_embeddings_publishers
])

# Apply the Pipeline to obtain embeddings
df = embedding_pipeline.fit(df).transform(df)

# Concatenate the two arrays of embedding vectors into one array column
# (authors first, then publishers).
df = df.withColumn(
    "author_publisher_combined_embedding",
    F.concat(F.col("authors_embedding.embeddings"), F.col("publishers_embedding.embeddings"))
)

sent_small_bert_L10_128 download started this may take some time.
Approximate size to download 21.9 MB
[OK!]
sent_small_bert_L10_128 download started this may take some time.
Approximate size to download 21.9 MB
[OK!]


In [None]:
# Display df without the raw author / publisher text columns.
# NOTE(review): the result of drop() is not assigned, so df itself still
# contains "authors" and "publisher" after this cell - confirm that is intended.
columns = ["authors","publisher"]
df.drop(*columns)

DataFrame[Title: string, description: string, categories: string, Impact: double, publishedYear: int, book_age: int, published_decade: int, published_century: int, published_era: string, title_tfidf: vector, desc_tfidf: vector, published_era_index: double, published_era_encoded: vector, author_frequency: bigint, publisher_frequency: bigint, authors_document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, publishers_document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, authors_embedding: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, publishers_embedding: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, author_publisher_combined_embedding: array<array<float>>]

# Encode Categories

In [None]:
from pyspark.ml.feature import FeatureHasher

# Hash the categories column into a 20-dimensional feature vector.
hasher = (
    FeatureHasher()
    .setInputCols(["categories"])
    .setOutputCol("Category_Index")
    .setNumFeatures(20)
)
df = hasher.transform(df)

# Sentiment Analysis for Title and Description

In [None]:

def build_sentiment_stages(text_col, stem, sentiment_col, finished_col):
    """Return the five Spark NLP stages that score one text column.

    Stage order: DocumentAssembler -> Tokenizer -> Normalizer ->
    ViveknSentimentModel -> Finisher. Intermediate columns are named
    ``<stem>_document``, ``<stem>_token`` and ``<stem>_normal``.

    Args:
        text_col: Input text column.
        stem: Prefix for the intermediate annotation columns.
        sentiment_col: Output column of the sentiment model.
        finished_col: Output column of the Finisher.
    """
    document_col = f"{stem}_document"
    token_col = f"{stem}_token"
    normal_col = f"{stem}_normal"
    return [
        DocumentAssembler().setInputCol(text_col).setOutputCol(document_col),
        NLPTokenizer().setInputCols([document_col]).setOutputCol(token_col),
        Normalizer().setInputCols([token_col]).setOutputCol(normal_col),
        ViveknSentimentModel.pretrained()
        .setInputCols([document_col, normal_col])
        .setOutputCol(sentiment_col),
        Finisher().setInputCols([sentiment_col]).setOutputCols([finished_col]),
    ]


def encode_sentiment(finished_col):
    """Map the first label in a finished sentiment array to an integer code.

    "positive" -> 1, "negative" -> 2, anything else -> 0.
    """
    first_label = F.element_at(finished_col, 1)
    return (
        F.when(first_label == "positive", 1)
        .when(first_label == "negative", 2)
        .otherwise(0)
    )


# Title stages run first, followed by the description stages.
sentiment_pipeline = Pipeline().setStages(
    build_sentiment_stages("Title", "title", "t_sentiment", "title_sentiment")
    + build_sentiment_stages("description", "desc", "d_sentiment", "description_sentiment")
)

# Fit and apply the sentiment pipeline.
df = sentiment_pipeline.fit(df).transform(df)

# Encode each sentiment label as an integer, then discard the finished
# label arrays so only the encoded columns remain.
df = (
    df.withColumn("title_sentiment_encoded", encode_sentiment("title_sentiment"))
      .withColumn("description_sentiment_encoded", encode_sentiment("description_sentiment"))
      .drop("title_sentiment", "description_sentiment")
)



sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[OK!]
sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[OK!]


In [None]:
# List the column names currently present in df.
df.columns

['authors',
 'publisher',
 'Title',
 'description',
 'categories',
 'Impact',
 'publishedYear',
 'book_age',
 'published_decade',
 'published_century',
 'published_era',
 'title_tfidf',
 'desc_tfidf',
 'published_era_index',
 'published_era_encoded',
 'author_frequency',
 'publisher_frequency',
 'author_publisher_combined_embedding',
 'Category_Index',
 'title_sentiment_encoded',
 'description_sentiment_encoded']

In [None]:
# Original categorical/text columns selected for removal. This cell only
# defines the list; it does not modify df.
columns_to_drop = ["authors", "publisher", "categories"]

In [None]:
# Print the current schema of df.
df.printSchema()

root
 |-- authors: string (nullable = false)
 |-- publisher: string (nullable = true)
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- categories: string (nullable = true)
 |-- Impact: double (nullable = true)
 |-- publishedYear: integer (nullable = true)
 |-- book_age: integer (nullable = true)
 |-- published_decade: integer (nullable = true)
 |-- published_century: integer (nullable = true)
 |-- published_era: string (nullable = false)
 |-- title_tfidf: vector (nullable = true)
 |-- desc_tfidf: vector (nullable = true)
 |-- published_era_index: double (nullable = false)
 |-- published_era_encoded: vector (nullable = true)
 |-- author_frequency: long (nullable = true)
 |-- publisher_frequency: long (nullable = true)
 |-- author_publisher_combined_embedding: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = false)
 |-- Category_Index: vector (nullable = true)
 |-- title_sentiment_en

In [None]:
def array_to_dense_vector(array):
    """Convert a list of numbers to a DenseVector.

    A falsy input (None or an empty list) yields an empty DenseVector.
    """
    if not array:
        return Vectors.dense([])
    return Vectors.dense(array)


array_to_vector_udf = F.udf(array_to_dense_vector, VectorUDT())

# Flatten the array of embedding arrays into one flat array, then turn it
# into a vector column.
flat_embedding = F.flatten("author_publisher_combined_embedding")
df = df.withColumn("author_publisher_combined_embedding", array_to_vector_udf(flat_embedding))

In [None]:
# Raw text/categorical columns to leave out of the cleaned frame.
columns_to_drop = [
    "authors",
    "publisher",
    "published_era",
    "categories"
]

# Drop the columns into a separate frame; df itself is left unchanged.
df_cleaned = df.drop(*columns_to_drop)
# Inspect the frame that actually had the columns removed, so the printed
# schema verifies the drop.
df_cleaned.printSchema()

root
 |-- authors: string (nullable = false)
 |-- publisher: string (nullable = true)
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- categories: string (nullable = true)
 |-- Impact: double (nullable = true)
 |-- publishedYear: integer (nullable = true)
 |-- book_age: integer (nullable = true)
 |-- published_decade: integer (nullable = true)
 |-- published_century: integer (nullable = true)
 |-- published_era: string (nullable = false)
 |-- title_tfidf: vector (nullable = true)
 |-- desc_tfidf: vector (nullable = true)
 |-- published_era_index: double (nullable = false)
 |-- published_era_encoded: vector (nullable = true)
 |-- author_frequency: long (nullable = true)
 |-- publisher_frequency: long (nullable = true)
 |-- author_publisher_combined_embedding: vector (nullable = true)
 |-- Category_Index: vector (nullable = true)
 |-- title_sentiment_encoded: integer (nullable = false)
 |-- description_sentiment_encoded: integer (nullable = false)



In [None]:
from pyspark.ml.linalg import Vectors, DenseVector
from pyspark.ml.functions import vector_to_array


def _to_dense(vector):
    """Return a dense copy of ``vector``, or None when the input is None."""
    if vector is None:
        return None
    return Vectors.dense(vector.toArray())


# UDF wrapper so the conversion can be applied to vector columns.
dense_vector_udf = F.udf(_to_dense, VectorUDT())

# Add a "<name>_dense" copy of each TF-IDF column.
for tfidf_column in ("title_tfidf", "desc_tfidf"):
    df = df.withColumn(f"{tfidf_column}_dense", dense_vector_udf(F.col(tfidf_column)))



In [None]:
# Remove the original TF-IDF columns, then show the resulting schema.
for replaced_column in ("title_tfidf", "desc_tfidf"):
    df = df.drop(replaced_column)
df.printSchema()

root
 |-- authors: string (nullable = false)
 |-- publisher: string (nullable = true)
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- categories: string (nullable = true)
 |-- Impact: double (nullable = true)
 |-- publishedYear: integer (nullable = true)
 |-- book_age: integer (nullable = true)
 |-- published_decade: integer (nullable = true)
 |-- published_century: integer (nullable = true)
 |-- published_era: string (nullable = false)
 |-- published_era_index: double (nullable = false)
 |-- published_era_encoded: vector (nullable = true)
 |-- author_frequency: long (nullable = true)
 |-- publisher_frequency: long (nullable = true)
 |-- author_publisher_combined_embedding: vector (nullable = true)
 |-- Category_Index: vector (nullable = true)
 |-- title_sentiment_encoded: integer (nullable = false)
 |-- description_sentiment_encoded: integer (nullable = false)
 |-- title_tfidf_dense: vector (nullable = true)
 |-- desc_tfidf_dense: vector (nullabl

In [None]:
# Candidate model inputs, in the order they are concatenated.
feature_columns = [
    # numeric year features
    "publishedYear", "book_age", "published_decade", "published_century",
    # frequency encodings
    "author_frequency", "publisher_frequency",
    # encoded categorical features
    "published_era_encoded", "Category_Index",
    # sentiment codes
    "title_sentiment_encoded", "description_sentiment_encoded",
    # text vectors
    "title_tfidf_dense", "desc_tfidf_dense",
    "author_publisher_combined_embedding",
]

# Keep only the candidates that are actually present in df.
existing_columns = set(df.columns)
available_feature_columns = [
    name for name in feature_columns if name in existing_columns
]

# Concatenate the available features into one vector column.
assembler = VectorAssembler(
    inputCols=available_feature_columns,
    outputCol="assembled_features",
)
df_assembled = assembler.transform(df)
# df_assembled.write.parquet("df_assembled.parquet")
# Verify the schema and data types
df_assembled.printSchema()

root
 |-- authors: string (nullable = false)
 |-- publisher: string (nullable = true)
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- categories: string (nullable = true)
 |-- Impact: double (nullable = true)
 |-- publishedYear: integer (nullable = true)
 |-- book_age: integer (nullable = true)
 |-- published_decade: integer (nullable = true)
 |-- published_century: integer (nullable = true)
 |-- published_era: string (nullable = false)
 |-- published_era_index: double (nullable = false)
 |-- published_era_encoded: vector (nullable = true)
 |-- author_frequency: long (nullable = true)
 |-- publisher_frequency: long (nullable = true)
 |-- author_publisher_combined_embedding: vector (nullable = true)
 |-- Category_Index: vector (nullable = true)
 |-- title_sentiment_encoded: integer (nullable = false)
 |-- description_sentiment_encoded: integer (nullable = false)
 |-- title_tfidf_dense: vector (nullable = true)
 |-- desc_tfidf_dense: vector (nullabl