In [None]:
from pyspark.sql.functions import concat_ws

# Concatenate title and abstract
df = df.withColumn('combined_text', concat_ws(' ', df.title, df.abstract))


In [None]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml.feature import VectorAssembler

# Tokenization
tokenizer = Tokenizer(inputCol="combined_text", outputCol="tokens")
df_tokens = tokenizer.transform(df)

# CountVectorizer
cv = CountVectorizer(inputCol="tokens", outputCol="raw_features")
cv_model = cv.fit(df_tokens)
df_cv = cv_model.transform(df_tokens)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(df_cv)
df_idf = idf_model.transform(df_cv)


In [None]:
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.functions import col, udf
from pyspark.sql.types import VectorUDT

# Define UDF for converting sparse to dense vectors
def to_dense(v):
    if isinstance(v, SparseVector):
        return DenseVector(v.toArray())
    elif isinstance(v, DenseVector):
        return v
    else:
        raise TypeError("Unsupported vector type")

to_dense_udf = udf(to_dense, VectorUDT())

# Apply UDF to convert features to dense vectors
df_dense = df_idf.withColumn('features_dense', to_dense_udf(col('features')))


In [None]:
from pyspark.ml.feature import PCA

# Perform PCA
pca = PCA(k=20, inputCol="features_dense", outputCol="pca_features")
pca_model = pca.fit(df_dense)
df_pca = pca_model.transform(df_dense)


In [None]:
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt

# Prepare data for KMeans
df_pca = df_pca.withColumn("pca_features_dense", to_dense_udf("pca_features"))

# Elbow method to determine the best k
costs = []
k_values = range(2, 21)

for k in k_values:
    kmeans = KMeans(k=k, seed=1, featuresCol="pca_features_dense")
    try:
        model = kmeans.fit(df_pca)
        costs.append(model.summary.trainingCost)
    except Exception as e:
        print(f"Error fitting KMeans with k={k}: {e}")
        break

# Plot the Elbow Method results
if costs:
    plt.plot(k_values[:len(costs)], costs, marker='o')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Cost')
    plt.title('Elbow Method for Optimal k')
    plt.show()
else:
    print("No valid KMeans models were trained.")


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, VectorAssembler, PCA
from pyspark.sql.types import StructType, StructField, StringType

# Create a SparkSession
spark = SparkSession.builder.appName("TextFeatureExtraction").getOrCreate()

# Sample Data
data = [("This movie was absolutely fantastic!",),
        ("I found this film to be quite disappointing.",),
        ("The acting was superb, but the plot was weak.",)]

# Define the schema explicitly
schema = StructType([StructField("review", StringType(), True)])  

# Create DataFrame with the schema
df = spark.createDataFrame(data, schema=schema)  

# 1. Tokenization ...making index of each word
tokenizer = Tokenizer(inputCol="review", outputCol="words")
words_df = tokenizer.transform(df)
# words_df.show()

# 2. CountVectorizer (Term Frequency) ...How many time appears in the sentence
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
model = cv.fit(words_df)
featurized_df = model.transform(words_df)
# featurized_df.show()

# 3. IDF (Inverse Document Frequency)  IDF(t)=log(1+DF(t)N​) percentage of one sentence appear in one words
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)
# rescaled_df.show()

# 4. VectorAssembler (Optional)
assembler = VectorAssembler(inputCols=["features"], outputCol="assembledFeatures")
assembled_df = assembler.transform(rescaled_df)

# Show assembled features
assembled_df.select("review", "assembledFeatures").show(truncate=False)

# 5. Apply PCA to reduce dimensions to 2
pca = PCA(k=2, inputCol="assembledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(assembled_df)
pca_df = pca_model.transform(assembled_df)

# Show the result
pca_df.select("review", "pcaFeatures").show(truncate=False)

# Stop the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Create a SparkSession
spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()

# Sample DataFrame
data = [
    (0, 1.0, 3.0, 5.0),
    (1, 2.0, 4.0, 6.0),
    (2, 3.0, 5.0, 7.0)
]

# Define the schema
schema = ["id", "feature1", "feature2", "feature3"]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)

# Create VectorAssembler
assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="features")

# Transform the DataFrame
assembled_df = assembler.transform(df)

# Show the result
assembled_df.select("id", "features").show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml.linalg import Vectors

# Create a SparkSession
spark = SparkSession.builder.appName("PCAExample").getOrCreate()

# Sample data
data = [
    (0, Vectors.dense([2.5, 2.4])),
    (1, Vectors.dense([0.5, 0.7])),
    (2, Vectors.dense([2.2, 2.9])),
    (3, Vectors.dense([1.9, 2.2])),
    (4, Vectors.dense([3.1, 3.0])),
    (5, Vectors.dense([2.3, 2.7])),
    (6, Vectors.dense([2.0, 1.6])),
    (7, Vectors.dense([1.0, 1.1])),
    (8, Vectors.dense([1.5, 1.6])),
    (9, Vectors.dense([1.1, 0.9]))
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "features"])

# Apply PCA
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
result = model.transform(df)

# Show the result
result.select("id", "features", "pcaFeatures").show(truncate=False)

# Stop the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, VectorAssembler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import matplotlib.pyplot as plt

# Create a SparkSession
spark = SparkSession.builder.appName("TextClustering").getOrCreate()

# Sample Data
data = [("This movie was absolutely fantastic!",),
        ("I found this film to be quite disappointing.",),
        ("The acting was superb, but the plot was weak.",)]

# Define the schema explicitly
schema = StructType([StructField("review", StringType(), True)])  

# Create DataFrame with the schema
df = spark.createDataFrame(data, schema=schema)  

# 1. Tokenization
tokenizer = Tokenizer(inputCol="review", outputCol="words")
words_df = tokenizer.transform(df)

# 2. CountVectorizer (Term Frequency)
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
model = cv.fit(words_df)
featurized_df = model.transform(words_df)

# 3. IDF (Inverse Document Frequency)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)

# 4. VectorAssembler
assembler = VectorAssembler(inputCols=["features"], outputCol="assembledFeatures")
assembled_df = assembler.transform(rescaled_df)

# 5. Apply PCA to keep 95% of the variance
pca = PCA(k=2, inputCol="assembledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(assembled_df)
pca_df = pca_model.transform(assembled_df)

# Show PCA-transformed features
pca_df.select("review", "pcaFeatures").show(truncate=False)

# Elbow Method to determine the optimal number of clusters
cost = []
for k in range(2, 10):
    kmeans = KMeans(k=k, seed=1, featuresCol='pcaFeatures')
    model = kmeans.fit(pca_df)
    predictions = model.transform(pca_df)
    evaluator = ClusteringEvaluator(featuresCol='pcaFeatures')
    cost.append(model.summary.trainingCost)

# Plot the cost to visualize the elbow
plt.figure(figsize=(8, 6))
plt.plot(range(2, 10), cost, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

# Choose the optimal k (you can observe the elbow plot and decide)
optimal_k = 3

# Apply K-means clustering with the optimal k
kmeans = KMeans(k=optimal_k, seed=1, featuresCol='pcaFeatures')
kmeans_model = kmeans.fit(pca_df)
kmeans_predictions = kmeans_model.transform(pca_df)

# Show the result with cluster assignments
kmeans_predictions.select("review", "pcaFeatures", "prediction").show(truncate=False)

# Stop the Spark session
spark.stop()


In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, VectorAssembler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, array, lit, expr
from pyspark.sql.types import DoubleType

# Create a SparkSession
spark = SparkSession.builder.appName("PaperRecommendation").getOrCreate()

# Sample Data
data = [("Paper A: This study is about machine learning techniques",),
        ("Paper B: A review on deep learning approaches",),
        ("Paper C: An introduction to neural networks",),
        ("Paper D: Advances in computer vision",),
        ("Paper E: A study on reinforcement learning",)]

# Define the schema explicitly
schema = StructType([StructField("title", StringType(), True)])  

# Create DataFrame with the schema
df = spark.createDataFrame(data, schema=schema)  

# 1. Tokenization
tokenizer = Tokenizer(inputCol="title", outputCol="words")
words_df = tokenizer.transform(df)

# 2. CountVectorizer (Term Frequency)
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
model = cv.fit(words_df)
featurized_df = model.transform(words_df)

# 3. IDF (Inverse Document Frequency)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)

# 4. VectorAssembler
assembler = VectorAssembler(inputCols=["features"], outputCol="assembledFeatures")
assembled_df = assembler.transform(rescaled_df)

# 5. Apply PCA to keep 95% of the variance
pca = PCA(k=2, inputCol="assembledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(assembled_df)
pca_df = pca_model.transform(assembled_df)

# 6. Apply K-means clustering with optimal k
optimal_k = 2  # Assuming k=2 for simplicity
kmeans = KMeans(k=optimal_k, seed=1, featuresCol='pcaFeatures')
kmeans_model = kmeans.fit(pca_df)
kmeans_predictions = kmeans_model.transform(pca_df)

# Show the clusters
kmeans_predictions.select("title", "pcaFeatures", "prediction").show(truncate=False)

# Define a function to calculate cosine similarity using SQL expressions
def cosine_similarity(v1, v2):
    dot_product = sum([v1[i] * v2[i] for i in range(len(v1))])
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return dot_product / (norm_v1 * norm_v2)

cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Recommender function
def recommend_papers(input_title, top_n):
    # Find the cluster of the input title
    input_paper = kmeans_predictions.filter(kmeans_predictions.title == input_title).select("pcaFeatures", "prediction").first()
    input_cluster = input_paper["prediction"]
    input_vector = input_paper["pcaFeatures"]

    # Filter papers in the same cluster
    cluster_papers = kmeans_predictions.filter(kmeans_predictions.prediction == input_cluster).select("title", "pcaFeatures")

    # Compute cosine similarity with all papers in the same cluster
    similarities = cluster_papers.withColumn("similarity", expr(f"vector_dot(pcaFeatures, array({', '.join(map(str, input_vector.toArray()))})) / (vector_norm(pcaFeatures) * vector_norm(array({', '.join(map(str, input_vector.toArray()))})))"))

    # Get top N recommendations
    recommendations = similarities.orderBy(col("similarity").desc()).limit(top_n)
    
    return recommendations.select("title", "similarity").collect()

# Example usage: Recommend top 3 papers similar to "Paper A"
input_title = "Paper A: This study is about machine learning techniques"
top_n = 1
recommendations = recommend_papers(input_title, top_n)

# Show recommendations
for rec in recommendations:
    print(f"Title: {rec['title']}, Similarity: {rec['similarity']}")

# Stop the Spark session
spark.stop()


NameError: name 'StructType' is not defined

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Word2VecExample") \
    .getOrCreate()

# Sample Data
data = [("This movie was absolutely fantastic!",),
        ("I found this film to be quite disappointing.",),
        ("The acting was superb, but the plot was weak.",)]

# Create a DataFrame
df = spark.createDataFrame(data, ["text"])

# Tokenize the text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df_tokenized = tokenizer.transform(df)

# Train a Word2Vec model
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="words", outputCol="word2vec_features")
model = word2vec.fit(df_tokenized)
result = model.transform(df_tokenized)

# Given word
given_word = "movie"

# Find similar words
similar_words = model.findSynonyms(given_word, 5)  # Find top 5 similar words

# Convert DataFrame to Pandas DataFrame
similar_words_df = similar_words.toPandas()

# Display the similar words
for index, row in similar_words_df.iterrows():
    print(f"Similar word: {row['word']}, Similarity: {row['similarity']}")

# Stop SparkSession
spark.stop()


Similar word: superb,, Similarity: 0.19690561294555664
Similar word: fantastic!, Similarity: 0.0834183618426323
Similar word: film, Similarity: 0.07623938471078873
Similar word: disappointing., Similarity: 0.06888052821159363
Similar word: quite, Similarity: 0.06005062907934189


In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec

# Start Spark session with increased timeout settings
spark = SparkSession.builder \
    .appName("Word2VecExample") \
    .config("spark.executor.heartbeatInterval", "100000s") \
    .config("spark.network.timeout", "10000000s") \
    .getOrCreate()

# Sample DataFrame with filtered words
df_filtered = spark.createDataFrame([
    (0, ["a", "b", "c"]),
    (1, ["d", "e", "f"]),
    (2, ["g", "h", "i"])
], ["id", "filtered_words"])

# Train Word2Vec model
# word2vec = Word2Vec(vectorSize=10, minCount=1, inputCol="filtered_words", outputCol="word2vec_features")


# Train a Word2Vec model
word2vec = Word2Vec(vectorSize=10, minCount=1, inputCol="filtered_words", outputCol="word2vec_features")
model = word2vec.fit(df_filtered)
df_vectorized = model.transform(df_filtered)

try:
    model = word2vec.fit(df_filtered)
    df_vectorized = model.transform(df_filtered)
    df_vectorized.show()
except Exception as e:
    print(f"Error: {e}")

# Stop the Spark session
spark.stop()


+---+--------------+--------------------+
| id|filtered_words|   word2vec_features|
+---+--------------+--------------------+
|  0|     [a, b, c]|[-0.0025148376201...|
|  1|     [d, e, f]|[0.00124437361955...|
|  2|     [g, h, i]|[-0.0368569927910...|
+---+--------------+--------------------+

