In [7]:
!pip install pyspark



In [77]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline


# diabetes.csv

In [78]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("DiabetesPrediction") \
    .getOrCreate()

In [79]:
# Load dataset
from google.colab import drive
drive.mount('/content/drive')
diabetes_df = spark.read.csv('/content/drive/My Drive/Colab Notebooks/Scalable and Distributed Computing/Lab/lab10/diabetes.csv', header=True, inferSchema=True)
# EDA
print("Diabetes Dataset Schema:")
diabetes_df.printSchema()
print("Diabetes Dataset Statistics:")
diabetes_df.describe().show()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Diabetes Dataset Schema:
root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

Diabetes Dataset Statistics:
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+----------

In [54]:
# Processing for Diabetes Dataset
diabetes_features = [col for col in diabetes_df.columns if col != 'Outcome']
assembler = VectorAssembler(inputCols=diabetes_features, outputCol="features")
indexer = StringIndexer(inputCol="Outcome", outputCol="label")

# Build and train model for Diabetes dataset
lr = LogisticRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[indexer, assembler, lr])

# Split data into training and test sets
diabetes_train, diabetes_test = diabetes_df.randomSplit([0.8, 0.2], seed=42)

# Train model
diabetes_model = pipeline.fit(diabetes_train)

# Evaluate model
diabetes_predictions = diabetes_model.transform(diabetes_test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(diabetes_predictions)
print(f"Diabetes Model Accuracy: {accuracy}")


Diabetes Model Accuracy: 0.7804878048780488


In [80]:
# Assemble features into a single vector column
feature_cols = diabetes_df.columns[:-1]  # Exclude the last column which is the target
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(diabetes_df)

# Select relevant columns (features and label)
df_final = df_assembled.select("features", "Outcome")


In [81]:
# Split data into train and test sets
(train_data, test_data) = df_final.randomSplit([0.8, 0.2], seed=42)


In [82]:
# Define RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features", labelCol="Outcome", seed=42)

# Fit the model
model = rf.fit(train_data)


In [84]:
# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate the model using areaUnderROC
evaluator = BinaryClassificationEvaluator(labelCol="Outcome")
areaUnderROC = evaluator.evaluate(predictions)
print(f"Area Under ROC on test data: {areaUnderROC}")

# Alternatively, evaluate using areaUnderPR (Precision-Recall curve)
evaluator.setMetricName("areaUnderPR")
areaUnderPR = evaluator.evaluate(predictions)
print(f"Area Under PR on test data: {areaUnderPR}")


Area Under ROC on test data: 0.8504273504273506
Area Under PR on test data: 0.7342594229334775


In [85]:
# Save the model
model.save("/content/drive/My Drive/Colab Notebooks/Scalable and Distributed Computing/Lab/lab10/diabetes_model")

# Load the model later
# loaded_model = RandomForestClassifier.load("/content/drive/My Drive/Colab Notebooks/Scalable and Distributed Computing/Lab/lab10/diabetes_model")


#housprice.csv

In [67]:
spark = SparkSession.builder \
    .appName("HousePricePrediction") \
    .getOrCreate()

In [68]:
houseprice_df = spark.read.csv('/content/drive/My Drive/Colab Notebooks/Scalable and Distributed Computing/Lab/lab10/housprice.csv', header=True, inferSchema=True)
print("House Price Dataset Schema:")
houseprice_df.printSchema()
print("House Price Dataset Statistics:")
houseprice_df.describe().show()

House Price Dataset Schema:
root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: s

In [69]:
# Handle missing values, replace with zeros for numeric columns
numeric_cols = [col for col, dtype in houseprice_df.dtypes if dtype in ['int', 'double']]
houseprice_df = houseprice_df.fillna(0, subset=numeric_cols)

# String Indexing and One-Hot Encoding
cat_cols = [col for col, dtype in houseprice_df.dtypes if dtype == 'string']

indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") for col in cat_cols]
encoder = [OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=indexer.getOutputCol()+"_encoded") for indexer in indexers]

pipeline = Pipeline(stages=indexers + encoder)
houseprice_df = pipeline.fit(houseprice_df).transform(houseprice_df)


In [70]:
# Vector Assembling
# Assemble features into a single vector column
feature_cols = numeric_cols + [indexer.getOutputCol()+"_encoded" for indexer in indexers]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
houseprice_df = assembler.transform(houseprice_df)

# Standard Scaling
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(houseprice_df)
houseprice_df = scaler_model.transform(houseprice_df)

In [71]:
# Split data into train and test sets
(train_data, test_data) = houseprice_df.randomSplit([0.8, 0.2], seed=42)


In [72]:
# Define and Train ML Model
rf = RandomForestRegressor(featuresCol="features", labelCol="SalePrice")

# Fit the model
model = rf.fit(train_data)


In [73]:
#Evaluate Model
# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")
# Save the model
model.save("/content/drive/My Drive/Colab Notebooks/Scalable and Distributed Computing/Lab/lab10/house_price_prediction_model")

# Load the model later if needed
# loaded_model = RandomForestRegressor.load("house_price_prediction_model")


Root Mean Squared Error (RMSE) on test data: 16813.14575634512


In [86]:
# Stop Spark session
spark.stop()