<a href="https://colab.research.google.com/github/BrendaGilisho/Clustering-techniques-in-data-mining/blob/main/Clustering_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing pyspark
# NOTE(review): no version is pinned here, so a fresh run installs whatever release is current.
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing necessary packages
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.context import SparkContext 
from google.colab import files


In [None]:
# Mount Google Drive under /content/drive; the CSV read later in the notebook uses a path below it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Create (or reuse) the Spark session that the rest of the notebook works with
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local")                    # run Spark locally inside the Colab VM
    .appName("Colab")
    .config('spark.ui.port', '4050')    # port for the Spark web UI
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output to confirm it was created
spark

In [None]:
# Load the book descriptions.
# With the default line-by-line parsing, fragments of description text end up as separate
# rows (free text under book_id), and embedded quotes are cut off. The descriptions
# therefore appear to be quoted fields containing commas, line breaks and doubled quotes:
#   multiLine=True  lets a quoted field span several physical lines
#   escape='"'      treats "" inside a quoted field as a literal quote character
data = spark.read.csv(
    '/content/drive/My Drive/Colab Notebooks/Datamining/books - description (1) (1).csv',
    header=True,
    multiLine=True,
    escape='"',
)


In [None]:
# Preview the first rows. Check that book_id holds ids rather than fragments of
# description text, which would mean the reader split a multi-line description into rows.
data.show(4)

+--------------------+--------------------+--------------------+
|             book_id|                name|         description|
+--------------------+--------------------+--------------------+
|                4833|    The Glass Castle|A tender, moving ...|
|Jeannette Walls g...| they lived like ...| moving among Sou...|
|               Later| when the money r...| or the romance o...|
|What is so astoni...| but that she des...|   but also a tender|
+--------------------+--------------------+--------------------+
only showing top 4 rows



In [None]:
# Inspect the column types; the read passes no schema, so check what book_id came in as
data.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)



**Data Preprocessing**

In [None]:
## Keep only the rows whose book_id can be read as an integer.
## The cast yields null for anything that is not a number, so those rows are dropped here;
## the column itself is still a string after this cell.
from pyspark.sql.functions import col

book_id_as_int = col("book_id").cast("int")
data_set = data.filter(book_id_as_int.isNotNull())

In [None]:
# Replace the string book_id column with its integer version
from pyspark.sql.types import IntegerType

book_id_int = data_set["book_id"].cast(IntegerType())
data_set = data_set.withColumn("book_id", book_id_int)

In [None]:
# Inspect the first 20 rows that survived the book_id filter
data_set.show(20)

+-------+--------------------+--------------------+
|book_id|                name|         description|
+-------+--------------------+--------------------+
|   4833|    The Glass Castle|A tender, moving ...|
|    590|Night (The Night ...|Born into a Jewis...|
|   4264|Angela's Ashes (F...|Imbued on every p...|
|   3361|    Eat, Pray, Love |A celebrated writ...|
|   4535|Into Thin Air: A ...|"A bank of clouds...|
|    518|Tuesdays with Morrie|Maybe it was a gr...|
|   3781|Running with Scis...|The true story of...|
|   3190|       Into the Wild|In April 1992 a y...|
|   3147|I Know Why the Ca...|"Sent by their mo...|
|    225|"A Child Called "...|               #1) "|
|   3035|Chickens, Mules a...|Perhaps if Joe an...|
|     99|Persepolis: The S...|Wise, funny, and ...|
|   3594|The Autobiography...|"Through a life o...|
|   4971|The Hiding Place:...|At one time Corri...|
|   1381|All Creatures Gre...|Delve into the ma...|
|   3099|         Confessions|Augustine's Confe...|
|     45|Me 

In [None]:
# Confirm that book_id is now an integer column
data_set.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
### Handling missing values
# Count the rows in data_set; the bare name on the last line displays the value
num_rows = data_set.count()
num_rows

136

In [None]:
# Show the number of missing (null) values in each column.
# The loop variables are deliberately not called `col`: a module-level `for col ...` loop
# rebinds the `col` function imported from pyspark.sql.functions to a plain string, after
# which any cell calling col(...) fails with "'str' object is not callable".
missing_values = [
    data_set.filter(data_set[column_name].isNull()).count()
    for column_name in data_set.columns
]

# Print the number of missing values for each column
for column_name, missing in zip(data_set.columns, missing_values):
    print(f"{column_name}: {missing}")

book_id: 0
name: 0
description: 0


In [None]:
### Normalization
## Intentionally skipped in this notebook.

In [None]:
## Missing values: the per-column check above found none, so nothing needs to be filled in.

In [None]:
## Convert the string columns to numeric indices (done in the next cell).

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Convert string columns to numerical values.
# Each StringIndexer adds one index column. The second indexer must be fit and applied on
# the output of the first (`dataset`), not on `data_set` again; otherwise the
# name_numeric column produced by the first step is thrown away.

indexer = StringIndexer(inputCol="name", outputCol="name_numeric")
dataset = indexer.fit(data_set).transform(data_set)

indexer = StringIndexer(inputCol="description", outputCol="description_numeric")
dataset = indexer.fit(dataset).transform(dataset)


In [None]:
# Inspect the indexed data; check which *_numeric columns are actually present
dataset.show()

+-------+--------------------+--------------------+-------------------+
|book_id|                name|         description|description_numeric|
+-------+--------------------+--------------------+-------------------+
|   4833|    The Glass Castle|A tender, moving ...|               39.0|
|    590|Night (The Night ...|Born into a Jewis...|               52.0|
|   4264|Angela's Ashes (F...|Imbued on every p...|               67.0|
|   3361|    Eat, Pray, Love |A celebrated writ...|               34.0|
|   4535|Into Thin Air: A ...|"A bank of clouds...|               16.0|
|    518|Tuesdays with Morrie|Maybe it was a gr...|               78.0|
|   3781|Running with Scis...|The true story of...|               97.0|
|   3190|       Into the Wild|In April 1992 a y...|               68.0|
|   3147|I Know Why the Ca...|"Sent by their mo...|               24.0|
|    225|"A Child Called "...|               #1) "|               12.0|
|   3035|Chickens, Mules a...|Perhaps if Joe an...|             

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql import SparkSession


In [None]:
# Preprocess the data
# The assembler below consumes name_onehot / description_onehot, but no earlier cell in
# this notebook creates them, so the one-hot encoding step is performed here.

# Make sure both index columns exist first (the indexing cell may have produced only one).
indexed = dataset
for source_col in ("name", "description"):
    numeric_col = f"{source_col}_numeric"
    if numeric_col not in indexed.columns:
        indexed = (
            StringIndexer(inputCol=source_col, outputCol=numeric_col)
            .fit(indexed)
            .transform(indexed)
        )

# One-hot encode the category indices into sparse vectors.
# OneHotEncoder is an estimator in Spark 3.x, hence fit() before transform().
encoder = OneHotEncoder(
    inputCols=["name_numeric", "description_numeric"],
    outputCols=["name_onehot", "description_onehot"],
)
encoded = encoder.fit(indexed).transform(indexed)

# Concatenate the two one-hot vectors into the single `features` column
assembler = VectorAssembler(inputCols=["name_onehot", "description_onehot"], outputCol="features")
df = assembler.transform(encoded)


In [None]:
# Drop the raw `name` string column from the feature table
df=df.drop('name')

In [None]:
# Drop the raw `description` string column from the feature table
df=df.drop('description')

In [None]:
# Inspect the feature table: ids, index columns, one-hot vectors and the assembled features
df.show()

+-------+------------+-------------------+-----------------+------------------+--------------------+
|book_id|name_numeric|description_numeric|      name_onehot|description_onehot|            features|
+-------+------------+-------------------+-----------------+------------------+--------------------+
|   4833|       100.0|               30.0|(121,[100],[1.0])|  (121,[30],[1.0])|(242,[100,151],[1...|
|    590|        65.0|               43.0| (121,[65],[1.0])|  (121,[43],[1.0])|(242,[65,164],[1....|
|   4264|        10.0|               60.0| (121,[10],[1.0])|  (121,[60],[1.0])|(242,[10,181],[1....|
|   3361|        27.0|               25.0| (121,[27],[1.0])|  (121,[25],[1.0])|(242,[27,146],[1....|
|   4535|        45.0|                4.0| (121,[45],[1.0])|   (121,[4],[1.0])|(242,[45,125],[1....|
|    518|       115.0|               73.0|(121,[115],[1.0])|  (121,[73],[1.0])|(242,[115,194],[1...|
|   3781|        85.0|               95.0| (121,[85],[1.0])|  (121,[95],[1.0])|(242,[85,216

In [None]:
from pyspark.ml.clustering import KMeans

# Split the data into training and testing sets:
# the first TRAIN_ROW_COUNT rows train the model, whatever is left of df is held out.
# NOTE(review): if df still has the 136 rows counted earlier, this leaves a single test row.
# NOTE(review): limit() is applied without an explicit ordering, and subtract() is a set
# difference, so the split depends on row order and on rows being distinct - confirm
# this is intended.
TRAIN_ROW_COUNT = 135
training_data = df.limit(TRAIN_ROW_COUNT)
test_data = df.subtract(training_data)


In [None]:
# Define the k-means estimator: 10 clusters, fixed seed, reading the assembled
# `features` column (spelled out here; it is also the default).
kmeans = KMeans(featuresCol="features", k=10, seed=1)

# Fit the model on the training rows
model = kmeans.fit(training_data)

In [None]:
# Show the centre vectors of the first three clusters.
# Each centre is a point in the assembled feature space, not a list of terms.
cluster_centers = model.clusterCenters()

# Slice instead of indexing with range(3): k-means may return fewer clusters than
# requested, and indexing a missing centre would raise IndexError.
for cluster_number, center in enumerate(cluster_centers[:3], start=1):
    print("Cluster ", cluster_number, ":")
    print(center)

Cluster  1 :
[0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.         0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.         0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.         0.00884956 0.         0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.         0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956 0.00884956
 0.00884956 0.         0.00884956 0.         0.00884956 0.00884956
 0.00884956 0.         0.00884956 0.00884956 0.00