In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e26b9f109fd3073ec73c91139543aee05ce3e802758422d95f6782c64109d8df
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [None]:
import pyspark
import pyspark.sql  as pyspark_sql
import pyspark.sql.types as pyspark_types
import pyspark.sql.functions  as F
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import row_number, desc



# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one already running in this kernel.
# Constructing SparkContext(...) directly raises an error when this cell is
# re-run while a context is still active; getOrCreate makes the cell idempotent.
sc = SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the SQL session on top of that context.
spark = pyspark_sql.SparkSession.builder.getOrCreate()

# Classification

In [None]:
# Six labelled points, one tuple per row: (feature1, feature2, label).
# Rows with label 1 have both features negative; rows with label 2 have both positive.
data = [
    (-1, -1, 1),
    (-2, -1, 1),
    (-3, -2, 1),
    (1, 1, 2),
    (2, 1, 2),
    (3, 2, 2),
]

# Turn the tuples into a Spark DataFrame, naming the tuple positions as columns.
df = spark.createDataFrame(data, schema=["feature1", "feature2", "label"])

df.show()

+--------+--------+-----+
|feature1|feature2|label|
+--------+--------+-----+
|      -1|      -1|    1|
|      -2|      -1|    1|
|      -3|      -2|    1|
|       1|       1|    2|
|       2|       1|    2|
|       3|       2|    2|
+--------+--------+-----+



In [None]:
# Map the raw class labels onto category indices for classification.
# StringIndexer learns the mapping from `df` and writes it to a new
# "indexedLabel" column (in the recorded output, 1 -> 0.0 and 2 -> 1.0).
from pyspark.ml.feature import StringIndexer

labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(df)

# Append the index column to the working DataFrame.
df = labelIndexer.transform(df)

df.show()

+--------+--------+-----+------------+
|feature1|feature2|label|indexedLabel|
+--------+--------+-----+------------+
|      -1|      -1|    1|         0.0|
|      -2|      -1|    1|         0.0|
|      -3|      -2|    1|         0.0|
|       1|       1|    2|         1.0|
|       2|       1|    2|         1.0|
|       3|       2|    2|         1.0|
+--------+--------+-----+------------+



In [None]:
# Assemble the feature columns into one vector column, then split the rows
# into training and test sets.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# The classifier reads a single vector column, so pack both features into "features".
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")

df = assembler.transform(df)

# 50/50 split. The seed is fixed so repeated runs draw the same split rather
# than a different one every time, which made the reported accuracy vary.
# NOTE(review): with only six rows either side can still end up very small or
# miss a class - confirm the split looks sensible, or change the seed.
(trainingData, testData) = df.randomSplit([0.5, 0.5], seed=42)

df.show()

+--------+--------+-----+------------+-----------+
|feature1|feature2|label|indexedLabel|   features|
+--------+--------+-----+------------+-----------+
|      -1|      -1|    1|         0.0|[-1.0,-1.0]|
|      -2|      -1|    1|         0.0|[-2.0,-1.0]|
|      -3|      -2|    1|         0.0|[-3.0,-2.0]|
|       1|       1|    2|         1.0|  [1.0,1.0]|
|       2|       1|    2|         1.0|  [2.0,1.0]|
|       3|       2|    2|         1.0|  [3.0,2.0]|
+--------+--------+-----+------------+-----------+



In [None]:
# Train a decision tree on the training split and score it on the test split.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train on the indexed label produced by the StringIndexer step. Without an
# explicit labelCol the classifier falls back to the raw "label" column and
# the "indexedLabel" column built earlier is never used.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

# Train the model on the training data
model = dt.fit(trainingData)

# Make predictions on the test data (adds a "prediction" column)
predictions = model.transform(testData)

# Evaluate accuracy against the same indexed label the model was trained on,
# so predictions and ground truth share one encoding.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print("Test Accuracy:", accuracy)

# Stop the SparkSession; the next section creates its own.
spark.stop()

Test Accuracy: 0.5


# Regression

In [None]:
from pyspark.sql import SparkSession

# Session for the regression section (the classification section stopped its own).
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)


In [None]:
# Ground-truth relationship: y = 1 * x1 + 2 * x2 + 3
#
# Each row is (x1, x2, y); y is computed from the formula so the data and the
# stated relationship cannot drift apart.
data = [
    (x1, x2, 1 * x1 + 2 * x2 + 3)
    for x1, x2 in [(1, 1), (1, 2), (2, 2), (2, 3)]
]

In [None]:
# Load the tuples into a DataFrame: two inputs (x1, x2) and the target (y).
df = spark.createDataFrame(data, schema=["x1", "x2", "y"])

df.show()

+---+---+---+
| x1| x2|  y|
+---+---+---+
|  1|  1|  6|
|  1|  2|  8|
|  2|  2|  9|
|  2|  3| 11|
+---+---+---+



In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack x1 and x2 into a single vector column named "features".
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")

# Add the vector column, then rename the target column from "y" to "label".
df = assembler.transform(df).withColumnRenamed("y", "label")

df.show()

+---+---+-----+---------+
| x1| x2|label| features|
+---+---+-----+---------+
|  1|  1|    6|[1.0,1.0]|
|  1|  2|    8|[1.0,2.0]|
|  2|  2|    9|[2.0,2.0]|
|  2|  3|   11|[2.0,3.0]|
+---+---+-----+---------+



In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator; the column names are spelled out here but are
# the same as the estimator's defaults.
lr = LinearRegression(featuresCol="features", labelCol="label")

# Fit on the full four-row DataFrame (no train/test split in this section).
model = lr.fit(df)


In [None]:
# Score one unseen point (x1=3, x2=3). The same assembler used for training
# builds its "features" column.
newData = assembler.transform(
    spark.createDataFrame([(3, 3)], ["x1", "x2"])
)
predictions = model.transform(newData)
predictions.select("features", "prediction").show()

# Report the fitted parameters.
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

# Shut the session down; the next section creates its own.
spark.stop()

+---------+------------------+
| features|        prediction|
+---------+------------------+
|[3.0,3.0]|12.000000000000004|
+---------+------------------+

Coefficients: [1.0000000000000033,2.0000000000000018]
Intercept: 2.99999999999999


# Clustering

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Session for the clustering section.
spark = (
    SparkSession.builder
    .appName("KMeansExample")
    .getOrCreate()
)

# Two groups of three points: feature1 is 1 or 10, feature2 is 2, 4 or 0.
data = [(x, y) for x in (1, 10) for y in (2, 4, 0)]

# Vectoriser that packs both features into a "features" column.
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")

# Build the DataFrame and attach the vector column in one step.
df = assembler.transform(
    spark.createDataFrame(data, ["feature1", "feature2"])
)

In [None]:
# Create a KMeans model with 2 clusters.
# The seed is fixed so centroid initialisation is the same on every run;
# without it the result can differ between runs. Cluster ids are arbitrary
# labels, so a different seed may swap which group is 0 and which is 1.
kmeans = KMeans(k=2, seed=1)

# Train the model on the data
model = kmeans.fit(df)

# Assign each row to its nearest cluster (adds a "prediction" column)
predictions = model.transform(df)

# Print the predictions
predictions.show()

# Stop the SparkSession; the next section creates its own.
spark.stop()

+--------+--------+----------+----------+
|feature1|feature2|  features|prediction|
+--------+--------+----------+----------+
|       1|       2| [1.0,2.0]|         0|
|       1|       4| [1.0,4.0]|         0|
|       1|       0| [1.0,0.0]|         0|
|      10|       2|[10.0,2.0]|         1|
|      10|       4|[10.0,4.0]|         1|
|      10|       0|[10.0,0.0]|         1|
+--------+--------+----------+----------+



# Association Rule Mining

In [None]:
from pyspark.sql import SparkSession

# Create a SparkSession for the association-rule section.
# The app name previously read "LinearRegressionExample", copied from the
# regression section; it now names the workload that actually runs here.
spark = SparkSession.builder.appName("FPGrowthExample").getOrCreate()

In [None]:
from pyspark.ml.fpm import FPGrowth

# Training baskets: one row per shopper, with the purchased items as a list.
df = spark.createDataFrame(
    [
        ("Tom", ["Bread", "Eggs", "Milk"]),
        ("Dick", ["Bread", "Eggs", "Butter", "Milk"]),
        ("Harry", ["Bread", "Eggs"]),
    ],
    schema=["name", "items"],
)

# Mine frequent itemsets and rules using the support and confidence
# thresholds set below.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# For each training basket, show the items the rules suggest adding.
model.transform(df).show()

# An unseen basket, scored with the same fitted model.
test_df = spark.createDataFrame(
    [
        ("John", ["Bread", "Eggs"]),
    ],
    schema=["name", "items"],
)

model.transform(test_df).show()

+-----+--------------------+----------+
| name|               items|prediction|
+-----+--------------------+----------+
|  Tom| [Bread, Eggs, Milk]|        []|
| Dick|[Bread, Eggs, But...|        []|
|Harry|       [Bread, Eggs]|    [Milk]|
+-----+--------------------+----------+

+----+-------------+----------+
|name|        items|prediction|
+----+-------------+----------+
|John|[Bread, Eggs]|    [Milk]|
+----+-------------+----------+

