In [None]:
!pip install pyspark



In [None]:
# Example: Linear Regression with Spark MLlib
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Start (or reuse) the Spark session for this example.
spark = (
    SparkSession.builder
    .appName('MLlib Example')
    .getOrCreate()
)

# Tiny in-memory sample: one (ID, Feature, Target) tuple per row.
data = [
    (1, 5.0, 20.0),
    (2, 10.0, 25.0),
    (3, 15.0, 30.0),
    (4, 20.0, 35.0),
]
columns = ['ID', 'Feature', 'Target']
df = spark.createDataFrame(data, schema=columns)

# Pack the single numeric input column into the vector column the estimator reads.
assembler = VectorAssembler(outputCol='Features', inputCols=['Feature'])
df_transformed = assembler.transform(df)

# Fit a linear regression of 'Target' on the assembled 'Features' vector.
lr = LinearRegression(labelCol='Target', featuresCol='Features')
model = lr.fit(df_transformed)

# Report the fitted parameters.
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')


Coefficients: [0.9999999999999992]
Intercept: 15.000000000000009


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

# Start (or reuse) the Spark session for this example.
spark = (
    SparkSession.builder
    .appName("LogisticRegressionExample")
    .getOrCreate()
)

# Four hand-written rows: (ID, dense 2-element feature vector, binary label).
columns = ['ID', 'Features', 'Label']
data = [
    (1, Vectors.dense([2.0, 3.0]), 0),
    (2, Vectors.dense([1.0, 5.0]), 1),
    (3, Vectors.dense([2.5, 4.5]), 1),
    (4, Vectors.dense([3.0, 6.0]), 0),
]
df = spark.createDataFrame(data, schema=columns)

# Fit a logistic regression of 'Label' on 'Features'.
lr = LogisticRegression(labelCol='Label', featuresCol='Features')
model = lr.fit(df)

# Report the fitted parameters.
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')


Coefficients: [-12.262057929180484,4.087352266486688]
Intercept: 11.56891272665312


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession
spark = SparkSession.builder.appName("KMeansClusteringExample").getOrCreate()

# Example dataset: (ID, dense 2-element feature vector) per row
data = [
    (1, Vectors.dense([1.0, 1.0])),
    (2, Vectors.dense([5.0, 5.0])),
    (3, Vectors.dense([10.0, 10.0])),
    (4, Vectors.dense([15.0, 15.0]))
]
columns = ['ID', 'Features']
df = spark.createDataFrame(data, columns)

# Train KMeans clustering model.
# KMeans initialisation is randomised; pin the seed so the centers (and the order
# in which they are listed) are reproducible across kernel restarts.
kmeans = KMeans(featuresCol='Features', k=2, seed=42)
model = kmeans.fit(df)

# Show cluster centers
centers = model.clusterCenters()
print(f'Cluster Centers: {centers}')


Cluster Centers: [array([12.5, 12.5]), array([3., 3.])]


In [None]:
# Homework
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used for the homework cells.
spark = SparkSession.builder.appName("Apple 2009-2024").getOrCreate()

# Read the CSV with a header row and let Spark infer column types,
# then preview the first five rows.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Apple_Cleaned_2009_2024.csv")
)
data.show(5)

+----+-----------------+------------------+-----------------------+--------------------+---------------------+----+------------------+----------------+-----------------------+-----------------------+-------------------------+----------------------------+------------+--------+---------+----------+----------------------------+-----------------------------+----------------------------------+
|year|EBITDA (millions)|Revenue (millions)|Gross Profit (millions)|Op Income (millions)|Net Income (millions)| EPS|Shares Outstanding|Year Close Price|Total Assets (millions)|Cash on Hand (millions)|Long Term Debt (millions)|Total Liabilities (millions)|Gross Margin|PE ratio|Employees|Is Outlier|Normalized EBITDA (millions)|Normalized Revenue (millions)|Normalized Gross Profit (millions)|
+----+-----------------+------------------+-----------------------+--------------------+---------------------+----+------------------+----------------+-----------------------+-----------------------+-----------------

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

# getOrCreate() reuses the session if one is already running in this kernel.
spark = SparkSession.builder \
    .appName("Apple 2009-2024") \
    .getOrCreate()

data = spark.read.csv("Apple_Cleaned_2009_2024.csv", header=True, inferSchema=True)

data.show(5)

# NOTE: rows are deliberately NOT filtered on 'Is Outlier'. That column is the
# classification target below; keeping only the False rows would leave a single
# class, so any classifier would trivially score 100% accuracy.

# Convert the 'Is Outlier' column from Boolean to Integer (False -> 0, True -> 1).
# Assumes inferSchema reads the column as boolean - TODO confirm against the CSV.
data = data.withColumn("Is Outlier", data["Is Outlier"].cast("integer"))

# Prepare the label and features for the classification model.
# We predict 'Is Outlier' (0 if not an outlier, 1 if an outlier).
# stringOrderType="alphabetAsc" pins the mapping "0" -> 0.0 and "1" -> 1.0; the
# default (frequencyDesc) assigns index 0 to whichever value is most frequent.
indexer = StringIndexer(
    inputCol="Is Outlier",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

# Numeric columns assembled into the model's feature vector.
feature_columns = [
    'EBITDA (millions)', 'Revenue (millions)', 'Gross Profit (millions)',
    'Op Income (millions)', 'Net Income (millions)', 'EPS',
    'Shares Outstanding', 'Total Assets (millions)',
    'Cash on Hand (millions)', 'Long Term Debt (millions)',
    'Total Liabilities (millions)', 'PE ratio', 'Employees'
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Build the preprocessing pipeline (label indexing, then feature assembly)
pipeline = Pipeline(stages=[indexer, assembler])
model_data = pipeline.fit(data).transform(data)
model_data.select("features", "label").show(5)

+----+-----------------+------------------+-----------------------+--------------------+---------------------+----+------------------+----------------+-----------------------+-----------------------+-------------------------+----------------------------+------------+--------+---------+----------+----------------------------+-----------------------------+----------------------------------+
|year|EBITDA (millions)|Revenue (millions)|Gross Profit (millions)|Op Income (millions)|Net Income (millions)| EPS|Shares Outstanding|Year Close Price|Total Assets (millions)|Cash on Hand (millions)|Long Term Debt (millions)|Total Liabilities (millions)|Gross Margin|PE ratio|Employees|Is Outlier|Normalized EBITDA (millions)|Normalized Revenue (millions)|Normalized Gross Profit (millions)|
+----+-----------------+------------------+-----------------------+--------------------+---------------------+----+------------------+----------------+-----------------------+-----------------------+-----------------

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random 80/20 train/test split of the prepared rows; the seed fixes the split.
train_data, test_data = model_data.randomSplit([0.8, 0.2], seed=1234)

# Fit a decision tree on the training portion.
dt_classifier = DecisionTreeClassifier(labelCol='label', featuresCol='features')
dt_model = dt_classifier.fit(train_data)

# Make predictions on the held-out rows and score them by accuracy.
predictions = dt_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Build the grid for hyperparameter tuning (3 x 3 = 9 candidate settings)
paramGrid = ParamGridBuilder() \
    .addGrid(dt_classifier.maxDepth, [2, 5, 10]) \
    .addGrid(dt_classifier.minInstancesPerNode, [1, 2, 5]) \
    .build()

# CrossValidator: 3-fold CV over the grid, scored with the accuracy evaluator.
# The fold assignment is random, so pin the seed (same value as the train/test
# split) to make the selected model reproducible across kernel restarts.
crossval = CrossValidator(estimator=dt_classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=1234)

cv_model = crossval.fit(train_data)
best_model = cv_model.bestModel

# Evaluate the best model found by cross-validation on the held-out test rows
cv_predictions = best_model.transform(test_data)
cv_accuracy = evaluator.evaluate(cv_predictions)
print(f"Best Model Accuracy: {cv_accuracy:.2f}")

Best Model Accuracy: 1.00
