In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session that runs against the "local" master,
# and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext

# Smoke test: build a five-row DataFrame (ids 0-4) and print it.
spark.range(5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Initialize the Spark session.
# NOTE(review): getOrCreate presumably reuses a session that is already
# active in this kernel (e.g. the one built in the first cell) - confirm.
spark = (
    SparkSession.builder
    .appName("KMeansExample")
    .getOrCreate()
)

# Load the dataset as a DataFrame. No schema is supplied or inferred here,
# so the numeric columns are cast explicitly further down.
data = spark.read.format("csv").option("header", "true").load("dataset/train_preprocessed.csv")

# Numeric columns that are assembled into the single vector column KMeans
# is trained on.
feature_cols = [
    'Age',
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
]

# Define the target data type
target_data_type = "float"

# Fail early, with a readable message, if the file lacks an expected column.
missing_cols = [name for name in feature_cols if name not in data.columns]
if missing_cols:
    raise ValueError(f"Feature columns missing from dataset: {missing_cols}")

# Cast every feature column in ONE projection instead of calling withColumn
# once per column: each withColumn call stacks another projection onto the
# query plan, whereas a single select keeps the plan flat. Column order and
# names are preserved; non-feature columns pass through untouched (quoted
# with backticks so a name containing a dot is not parsed as a nested field).
# NOTE(review): values that cannot be parsed as numbers presumably become
# null after the cast - confirm the preprocessed file contains none, since
# nulls may make the assembler fail.
feature_set = set(feature_cols)
data = data.select(
    [
        col(name).cast(target_data_type).alias(name)
        if name in feature_set
        else col("`" + name + "`")
        for name in data.columns
    ]
)

# Pack the feature columns into the "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Print DataFrame schema
print("DataFrame Schema:")
data.printSchema()

# To preview a few rows, run: data.show(5)

# Train a KMeans model with 5 clusters; the fixed seed keeps runs reproducible.
kmeans = KMeans().setK(5).setSeed(1)
model = kmeans.fit(data)

# Make predictions (assign each row to a cluster)
predictions = model.transform(data)

# Show the result: one center per cluster
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# Stop Spark session.
# NOTE(review): if this session is shared with other cells, `spark`, `sc`
# and DataFrames such as `data` / `predictions` are likely unusable
# afterwards - confirm that is intended.
spark.stop()


DataFrame Schema:
root
 |-- Age: float (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Annual_Income: float (nullable = true)
 |-- Monthly_Inhand_Salary: float (nullable = true)
 |-- Num_Bank_Accounts: float (nullable = true)
 |-- Num_Credit_Card: float (nullable = true)
 |-- Interest_Rate: float (nullable = true)
 |-- Num_of_Loan: float (nullable = true)
 |-- Delay_from_due_date: float (nullable = true)
 |-- Num_of_Delayed_Payment: float (nullable = true)
 |-- Changed_Credit_Limit: float (nullable = true)
 |-- Num_Credit_Inquiries: float (nullable = true)
 |-- Credit_Mix: string (nullable = true)
 |-- Outstanding_Debt: float (nullable = true)
 |-- Credit_Utilization_Ratio: float (nullable = true)
 |-- Payment_of_Min_Amount: string (nullable = true)
 |-- Total_EMI_per_month: float (nullable = true)
 |-- Amount_invested_monthly: float (nullable = true)
 |-- Payment_Behaviour: string (nullable = true)
 |-- Monthly_Balance: float (nullable = true)
 |-- Credit_Score: strin