In [2]:
import html

from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, RowMatrix
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session on the local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("LSA with SVD")
    .getOrCreate()
)

# Toy corpus: one (id, text) pair per document.
data = [
    (0, "Spark is a unified analytics engine for big data processing"),
    (1, "Machine learning is a method of data analysis that automates analytical model building"),
    (2, "Deep learning models are built using artificial neural networks"),
    (3, "Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge from data"),
]
df = spark.createDataFrame(data, schema=["id", "text"])

# Feature stages, declared together so the column wiring
# text -> words -> filtered -> rawFeatures -> features reads top to bottom.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Apply the stateless stages in order: tokenise, drop stop words, hash to
# term-frequency vectors of size 1000.
words_data = tokenizer.transform(df)
filtered_data = remover.transform(words_data)
featurized_data = hashingTF.transform(filtered_data)

# IDF is the only stage that has to be fitted; fit it on the TF vectors and
# then rescale those same vectors into the "features" column.
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

# Convert to an IndexedRowMatrix for SVD.
# Every TF-IDF vector is tagged with its document id, so the rows of U can be
# matched back to the documents by id. Relying on the positional index from
# zipWithIndex() is only correct while ids happen to be 0..n-1 and the RDD
# happens to be in id order.
rows_rdd = rescaled_data.select("id", "features").rdd.map(
    lambda row: IndexedRow(row["id"], MLLibVectors.fromML(row["features"]))
)
mat = IndexedRowMatrix(rows_rdd)

# Apply SVD, keeping the top 2 singular values and requesting U.
svd = mat.computeSVD(2, computeU=True)

# Collect U (document-topic matrix) as (document id, [component values]).
u_rows = svd.U.rows.map(
    lambda row: (row.index, row.vector.toArray().tolist())
)
u_df = spark.createDataFrame(u_rows, ["id", "svd_features"])

# Join back with the original text. A join does not promise any row order,
# so sort by id to keep the printed table stable between runs.
final_df = df.join(u_df, on="id").orderBy("id")
final_df.select("id", "text", "svd_features").show(truncate=False)

# Convert results to HTML.
# Cell values are passed through html.escape so that characters such as
# '<', '>' or '&' in a document's text cannot break the table markup.
html_header = """
<html><head><title>LSA Output</title></head><body>
<h2>LSA using SVD</h2>
<table border="1"><tr><th>ID</th><th>Text</th><th>SVD Features</th></tr>
"""
html_footer = "</table></body></html>"

# One <tr> per joined row; the fragments are joined once instead of growing
# the string with += on every iteration.
html_body = "".join(
    "<tr>"
    f"<td>{html.escape(str(row['id']))}</td>"
    f"<td>{html.escape(str(row['text']))}</td>"
    f"<td>{html.escape(str(row['svd_features']))}</td>"
    "</tr>\n"
    for row in final_df.collect()
)

html_output = html_header + html_body + html_footer





+---+-------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+
|id |text                                                                                                                                       |svd_features                                 |
+---+-------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+
|0  |Spark is a unified analytics engine for big data processing                                                                                |[0.023219821730851027, 0.0024105356152050003]|
|1  |Machine learning is a method of data analysis that automates analytical model building                                                     |[0.031357203854802326, 0.15386572241412375]  |
|2  |Deep learning models are built usin