In [1]:
# Initialize Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PlayStoreML").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/19 16:25:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/19 16:25:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load dataset from DBFS or specify the path to the file
file_path = "/Users/anujabavkar/Downloads/Play Store Data.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Show first few rows of data to understand structure
data.show(5)

                                                                                

+-------+---+--------------------+----+-----+------+-------------+--------+--------------+
|gender |age|                 app|type|price|rating|        genre|installs|content_rating|
+-------+---+--------------------+----+-----+------+-------------+--------+--------------+
| Female| 17|Photo Editor & Ca...|Free|    0|   4.1| Art & Design|   10000|           4.0|
|   Male| 29| Coloring book moana|Free|    0|   3.9|Art & Design |  500000|           4.0|
| Female| 69|U Launcher Lite –...|Free|    0|   4.7| Art & Design| 5000000|           4.0|
| Female| 19|Sketch - Draw & P...|Free|    0|   4.5| Art & Design|50000000|          12.0|
|   Male| 28|Pixel Draw - Numb...|Free|    0|   4.3|Art & Design |  100000|           4.0|
+-------+---+--------------------+----+-----+------+-------------+--------+--------------+
only showing top 5 rows



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer

# Inspect column names
print("Columns:", data.columns)

# Encode categorical columns
if "type" in data.columns and "TypeIndex" not in data.columns:
    indexer = StringIndexer(inputCol="type", outputCol="TypeIndex")
    data = indexer.fit(data).transform(data)

if "content_rating" in data.columns and "ContentRatingIndex" not in data.columns:
    indexer = StringIndexer(inputCol="content_rating", outputCol="ContentRatingIndex")
    data = indexer.fit(data).transform(data)

# Convert numerical columns to appropriate formats if necessary
if "installs" in data.columns:
    data = data.withColumn("Installs", col("installs").cast("int"))
if "rating" in data.columns:
    data = data.withColumn("Rating", col("rating").cast("float"))
if "price" in data.columns:
    data = data.withColumn("Price", col("price").cast("float"))

# Show the first few rows of the transformed data
data.show(5)

Columns: ['gender ', 'age', 'app', 'type', 'price', 'rating', 'genre', 'installs', 'content_rating']


                                                                                

+-------+---+--------------------+----+-----+------+-------------+--------+--------------+---------+------------------+
|gender |age|                 app|type|Price|Rating|        genre|Installs|content_rating|TypeIndex|ContentRatingIndex|
+-------+---+--------------------+----+-----+------+-------------+--------+--------------+---------+------------------+
| Female| 17|Photo Editor & Ca...|Free|  0.0|   4.1| Art & Design|   10000|           4.0|      0.0|               0.0|
|   Male| 29| Coloring book moana|Free|  0.0|   3.9|Art & Design |  500000|           4.0|      0.0|               0.0|
| Female| 69|U Launcher Lite –...|Free|  0.0|   4.7| Art & Design| 5000000|           4.0|      0.0|               0.0|
| Female| 19|Sketch - Draw & P...|Free|  0.0|   4.5| Art & Design|50000000|          12.0|      0.0|               1.0|
|   Male| 28|Pixel Draw - Numb...|Free|  0.0|   4.3|Art & Design |  100000|           4.0|      0.0|               0.0|
+-------+---+--------------------+----+-

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Check and encode categorical columns
if "TypeIndex" not in data.columns:
    indexer = StringIndexer(inputCol="Type", outputCol="TypeIndex")
    data = indexer.fit(data).transform(data)

# Convert necessary columns to numeric types
data = data.withColumn("Rating", col("Rating").cast("double"))
data = data.withColumn("Installs", col("Installs").cast("double"))
data = data.withColumn("Price", col("Price").cast("double"))

# Drop rows with null values in essential columns
data = data.dropna(subset=["Rating", "Installs", "TypeIndex", "Price"])

# Assemble features for regression model
if 'features' in data.columns:
    data = data.drop('features')

assembler = VectorAssembler(inputCols=["Rating", "Installs", "TypeIndex"], outputCol="features")
data = assembler.transform(data)


In [6]:
# Split data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="Price")
lr_model = lr.fit(train_data)

# Make predictions and evaluate
predictions = lr_model.transform(test_data)
predictions.select("features", "Price", "prediction").show(5)

# Evaluate the model using RMSE
evaluator = RegressionEvaluator(labelCol="Price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) for Linear Regression: {rmse}")

24/11/19 16:29:31 WARN Instrumentation: [77520b07] regParam is zero, which might cause numerical instability and overfitting.
24/11/19 16:29:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/19 16:29:32 WARN Instrumentation: [77520b07] Mean and standard deviation of the label are zero, so the coefficients and the intercept will all be zero; as a result, training is not needed.


+--------------------+-----+----------+
|            features|Price|prediction|
+--------------------+-----+----------+
|[3.90000009536743...|  0.0|       0.0|
|[3.79999995231628...|  0.0|       0.0|
|[4.30000019073486...|  0.0|       0.0|
|[4.40000009536743...|  0.0|       0.0|
|[4.40000009536743...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows

Root Mean Squared Error (RMSE) for Linear Regression: 0.0


In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Check if 'features' column exists and remove it if it does
if 'features' in data.columns:
    data = data.drop('features')

# Assemble features for logistic regression to predict 'TypeIndex'
assembler = VectorAssembler(inputCols=["Rating", "Installs", "Price"], outputCol="features")
data = assembler.transform(data)

# Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Train Logistic Regression Model
lr_class = LogisticRegression(featuresCol="features", labelCol="TypeIndex")
lr_class_model = lr_class.fit(train_data)

# Make predictions on the test data
predictions_class = lr_class_model.transform(test_data)
predictions_class.select("features", "TypeIndex", "prediction").show(5)

# Evaluate the model accuracy
evaluator_class = MulticlassClassificationEvaluator(labelCol="TypeIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_class.evaluate(predictions_class)
print(f"Accuracy for Logistic Regression Classification: {accuracy}")

24/11/19 16:29:40 WARN Instrumentation: [a096b468] All labels are the same value and fitIntercept=true, so the coefficients will be zeros. Training is not needed.


+--------------------+---------+----------+
|            features|TypeIndex|prediction|
+--------------------+---------+----------+
|[3.90000009536743...|      0.0|       0.0|
|[3.79999995231628...|      0.0|       0.0|
|[4.30000019073486...|      0.0|       0.0|
|[4.40000009536743...|      0.0|       0.0|
|[4.40000009536743...|      0.0|       0.0|
+--------------------+---------+----------+
only showing top 5 rows

Accuracy for Logistic Regression Classification: 1.0


In [8]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Check if 'features' column exists and remove it if it does
if 'features' in data.columns:
    data = data.drop('features')

# Assemble features for KMeans clustering
assembler = VectorAssembler(inputCols=["Rating", "Installs", "Price"], outputCol="features")
data = assembler.transform(data)

# Train KMeans Model with 3 clusters
kmeans = KMeans(featuresCol="features", k=3)
model = kmeans.fit(data)

# Make predictions and show clusters
predictions_cluster = model.transform(data)
predictions_cluster.select("features", "prediction").show(5)

# Display cluster centers
print("Cluster Centers:")
for center in model.clusterCenters():
  print(center)

[Stage 21:>                                                         (0 + 1) / 1]                                                                                

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[4.09999990463256...|         0|
|[3.90000009536743...|         0|
|[4.69999980926513...|         0|
|     [4.5,5.0E7,0.0]|         0|
|[4.30000019073486...|         0|
+--------------------+----------+
only showing top 5 rows

Cluster Centers:
[3.81317392e+00 9.96537614e+06 0.00000000e+00]
[4.26491233e+00 1.00000000e+09 0.00000000e+00]
[4.34507048e+00 5.00000000e+08 0.00000000e+00]


In [0]:
jhjh