In [None]:
# Example: fit a single-feature linear regression with Spark MLlib.
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Reuse the active Spark session if there is one, otherwise start it.
spark = SparkSession.builder.appName('MLlib Example').getOrCreate()

# Toy dataset of (ID, Feature, Target) rows; every Target equals Feature + 15.
columns = ['ID', 'Feature', 'Target']
data = [
    (1, 5.0, 20.0),
    (2, 10.0, 25.0),
    (3, 15.0, 30.0),
    (4, 20.0, 35.0),
]
df = spark.createDataFrame(data, columns)

# Pack the scalar 'Feature' column into the vector column 'Features',
# which is the column the regressor is configured to read below.
assembler = VectorAssembler(
    inputCols=['Feature'],
    outputCol='Features',
)
df_transformed = assembler.transform(df)

# Fit the regression on the assembled frame.
lr = LinearRegression(
    featuresCol='Features',
    labelCol='Target',
)
model = lr.fit(df_transformed)

# Report the fitted slope(s) and intercept.
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')


Coefficients: [0.9999999999999992]
Intercept: 15.000000000000009


In [None]:
# Practice: Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

# Example dataset of (ID, Features, Label) rows.
# The features are wrapped in Vectors.dense so Spark infers a vector column.
# Plain Python lists are inferred as array<double>, which
# LogisticRegression.fit rejects with an IllegalArgumentException.
data = [
    (1, Vectors.dense([2.0, 3.0]), 0),
    (2, Vectors.dense([1.0, 5.0]), 1),
    (3, Vectors.dense([2.5, 4.5]), 1),
    (4, Vectors.dense([3.0, 6.0]), 0),
]
columns = ['ID', 'Features', 'Label']
# `spark` is the session created in an earlier cell.
df = spark.createDataFrame(data, columns)

# Train logistic regression model
lr = LogisticRegression(featuresCol='Features', labelCol='Label')
model = lr.fit(df)

# Display coefficients and intercept
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')


IllegalArgumentException: requirement failed: Column Features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<double>.

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

# Get (or create) the Spark session.
spark = SparkSession.builder.appName("LogisticRegressionExample").getOrCreate()

# Sample dataset of (ID, Features, Label) rows; features start out as plain lists.
columns = ['ID', 'Features', 'Label']
data = [(1, [2.0, 3.0], 0), (2, [1.0, 5.0], 1), (3, [2.5, 4.5], 1), (4, [3.0, 6.0], 0)]

# Build the DataFrame.
df = spark.createDataFrame(data, columns)


def _to_dense(values):
    """Wrap a sequence of numbers in an MLlib dense vector."""
    return Vectors.dense(values)


# Convert the array-typed 'Features' column into a vector column, because
# the estimator below does not accept array<double>.
vector_udf = udf(_to_dense, VectorUDT())
df = df.withColumn("Features", vector_udf(df.Features))

# Train the logistic regression model.
lr = LogisticRegression(
    featuresCol='Features',
    labelCol='Label',
)
model = lr.fit(df)

# Show the fitted coefficients and intercept.
print(f'Koefisien: {model.coefficients}')
print(f'Intercept: {model.intercept}')


Koefisien: [-12.262057929180484,4.087352266486688]
Intercept: 11.56891272665312


In [None]:
# Practice: KMeans Clustering
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors

# Example dataset of (ID, Features) rows.
# The points are wrapped in Vectors.dense so the column is a vector column.
# With plain Python lists, KMeans.fit raised an IllegalArgumentException
# about the type of the 'Features' column.
data = [
    (1, Vectors.dense([1.0, 1.0])),
    (2, Vectors.dense([5.0, 5.0])),
    (3, Vectors.dense([10.0, 10.0])),
    (4, Vectors.dense([15.0, 15.0])),
]
columns = ['ID', 'Features']
# `spark` is the session created in an earlier cell.
df = spark.createDataFrame(data, columns)

# Train KMeans clustering model with two clusters
kmeans = KMeans(featuresCol='Features', k=2)
model = kmeans.fit(df)

# Show cluster centers
centers = model.clusterCenters()
print(f'Cluster Centers: {centers}')


IllegalArgumentException: requirement failed: Column Features must be of type equal to one of the following types: [struct<type:tinyint,size:int,indices:array<int>,values:array<double>>, array<double>, array<float>] but was actually of type array<double>.

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

# Get (or create) the Spark session.
spark = SparkSession.builder.appName("KMeansClusteringExample").getOrCreate()

# Sample dataset of (ID, Features) rows.
# Every row needs a trailing comma: without one, Python parses two adjacent
# tuples as a call and raises "TypeError: 'tuple' object is not callable".
data = [
    (1, [1.0, 1.0]),
    (2, [5.0, 5.0]),
    (3, [10.0, 10.0]),
    (4, [15.0, 15.0]),
]
columns = ['ID', 'Features']

# Build the DataFrame.
df = spark.createDataFrame(data, columns)

# Convert the array-typed 'Features' column into a dense vector column.
vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
df = df.withColumn("Features", vector_udf(df["Features"]))

# Train the KMeans model with two clusters.
kmeans = KMeans(featuresCol='Features', k=2)
model = kmeans.fit(df)

# Show the cluster centers.
centers = model.clusterCenters()
print(f'Cluster Centers: {centers}')


Cluster Centers: [array([12.5, 12.5]), array([3., 3.])]


In [None]:
!pip install kaggle



In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API token).
# The returned dict maps each filename to the raw file bytes, so it is bound
# to a variable instead of being left as the cell's last expression.
# Displaying it would write the API key into the saved notebook output.
uploaded = files.upload()
print(f"Uploaded: {list(uploaded)}")


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
import os

# Point KAGGLE_CONFIG_DIR at /content for this process and the shell
# commands it launches (presumably where kaggle.json was uploaded; confirm).
os.environ.update(KAGGLE_CONFIG_DIR="/content")


In [None]:
# Download the diaaessam/constellation-names dataset archive with the Kaggle CLI.
# NOTE(review): presumably relies on the kaggle.json credentials configured
# in the earlier cells; confirm.
!kaggle datasets download -d diaaessam/constellation-names


Dataset URL: https://www.kaggle.com/datasets/diaaessam/constellation-names
License(s): other
Downloading constellation-names.zip to /content
  0% 0.00/1.93k [00:00<?, ?B/s]
100% 1.93k/1.93k [00:00<00:00, 2.78MB/s]


In [None]:
!unzip constellation-names.zip


Archive:  constellation-names.zip
  inflating: Constellation Names.csv  


In [None]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session.
spark = SparkSession.builder.appName("ConstellationDataset").getOrCreate()

# Read the CSV, treating the first row as the header and letting Spark
# infer the column types.
file_path = "Constellation Names.csv"  # Change to the correct path if needed
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# Show the schema and the first five rows.
df.printSchema()
df.show(5)


root
 |-- Name: string (nullable = true)
 |-- Abbreviation: string (nullable = true)
 |-- Genitive: string (nullable = true)
 |-- Meaning: string (nullable = true)
 |-- Brightest Star: string (nullable = true)

+---------+------------+----------+--------------------+--------------+
|     Name|Abbreviation|  Genitive|             Meaning|Brightest Star|
+---------+------------+----------+--------------------+--------------+
|Andromeda|         And|Andromedae|Princess of Ethiopia|     Alpheratz|
|   Antlia|         Ant|   Antilae|            Air Pump| Alpha Antilae|
|     Apus|         Aps|    Apodis|    Bird of Paradise|  Alpha Apodis|
| Aquarius|         Aqr|   Aquarii|        Water-bearer|     Sadalsuud|
|   Aquila|         Aql|   Aquilae|           The Eagle|        Altair|
+---------+------------+----------+--------------------+--------------+
only showing top 5 rows



In [None]:
# Quick overview of the loaded DataFrame: size first, then column names.
print(f"Jumlah baris: {df.count()}")
print(f"Kolom: {df.columns}")

# Descriptive statistics for every column, built first and then displayed.
summary = df.describe()
summary.show()


Jumlah baris: 88
Kolom: ['Name', 'Abbreviation', 'Genitive', 'Meaning', 'Brightest Star']
+-------+---------+------------+----------+--------+--------------+
|summary|     Name|Abbreviation|  Genitive| Meaning|Brightest Star|
+-------+---------+------------+----------+--------+--------------+
|  count|       88|          88|        88|      88|            88|
|   mean|     NULL|        NULL|      NULL|    NULL|          NULL|
| stddev|     NULL|        NULL|      NULL|    NULL|          NULL|
|    min|Andromeda|         And|Andromedae|Air Pump|      Achernar|
|    max|Vulpecula|        qSer|Vulpeculae|    Wolf|Zubeneschamali|
+-------+---------+------------+----------+--------+--------------+



In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Get (or create) the Spark session.
spark = SparkSession.builder.appName("ConstellationProcessing").getOrCreate()

# Reload the dataset from CSV (header row, inferred types).
file_path = "Constellation Names.csv"  # Change to the correct path if needed
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# List the available columns.
print(f"Kolom yang tersedia: {df.columns}")

# Step 1: encode the string column 'Abbreviation' as a numeric index.
abbreviation_indexer = StringIndexer(
    inputCol="Abbreviation",
    outputCol="AbbreviationIndexed",
)
abbreviation_model = abbreviation_indexer.fit(df)
df = abbreviation_model.transform(df)

# Step 2: pack the indexed abbreviation into the 'Features' vector column.
assembler = VectorAssembler(
    inputCols=["AbbreviationIndexed"],
    outputCol="Features",
)
df = assembler.transform(df)

# Step 3: encode 'Brightest Star' as the numeric 'Label' column.
label_indexer = StringIndexer(
    inputCol="Brightest Star",
    outputCol="Label",
)
label_model = label_indexer.fit(df)
df = label_model.transform(df)

# Preview the processed feature/label pairs.
df.select("Features", "Label").show(5)


Kolom yang tersedia: ['Name', 'Abbreviation', 'Genitive', 'Meaning', 'Brightest Star']
+--------+-----+
|Features|Label|
+--------+-----+
|   [0.0]| 29.0|
|   [1.0]|  6.0|
|   [2.0]|  7.0|
|   [4.0]| 80.0|
|   [3.0]| 31.0|
+--------+-----+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 80/20 train/test split with a fixed seed.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Configure and fit logistic regression on the training split.
lr = LogisticRegression(
    featuresCol="Features",
    labelCol="Label",
)
lr_model = lr.fit(train_data)

# Score the held-out split and measure accuracy.
# NOTE(review): the recorded run reports accuracy 0.0. 'Label' is derived
# from 'Brightest Star', which likely has almost as many distinct values as
# the dataset has rows, so test labels may never appear in training. Confirm.
predictions = lr_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol="Label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Akurasi Model: {accuracy}")


Akurasi Model: 0.0


In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Split the dataset 80/20 with a fixed seed.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Base logistic regression estimator to be tuned.
lr = LogisticRegression(featuresCol="Features", labelCol="Label")

# Hyperparameter grid: 3 regularization strengths x 3 elastic-net mixes.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 1.0])  # Regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])  # ElasticNet mixing
             .build())

# Accuracy evaluator, used both for model selection and the final test score.
evaluator = MulticlassClassificationEvaluator(labelCol="Label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation over the grid.
# The seed pins the fold assignment so the selected model is reproducible
# across runs, matching the seeded train/test split above.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

# Fit every grid point on every fold of the training split.
cv_model = crossval.fit(train_data)

# Best model found by cross-validation.
best_model = cv_model.bestModel

# Evaluate the best model on the held-out test split.
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print(f"Akurasi Model Terbaik: {accuracy}")


Akurasi Model Terbaik: 0.0
