In [None]:
# Mount the user's Google Drive inside the Colab VM so that files under
# /content/drive can be read by later cells (the dataset is loaded from there).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=94c4b2a64eebd15aa3b9a0b78599bc0138f3089541fd594cd21a823b595c9068
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
# Start a Spark session for this notebook, or reuse the one that is already running.
# NOTE(review): the app name says 'ChemicalClassification' although the notebook
# loads indian_liver_patient.csv -- confirm the name is intentional.
spark = (
    SparkSession.builder
    .appName("ChemicalClassification")
    .getOrCreate()
)

In [None]:
# Load the CSV from the mounted Drive. The first line is treated as the header
# and column types are inferred from the values.
csv_path = "/content/drive/MyDrive/Colab Notebooks/ML/indian_liver_patient.csv"
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

In [None]:
# Bare expression: the notebook displays the DataFrame's repr (column names and inferred types).
data

DataFrame[Age: int, Gender: string, Total_Bilirubin: double, Direct_Bilirubin: double, Alkaline_Phosphotase: int, Alamine_Aminotransferase: int, Aspartate_Aminotransferase: int, Total_Protiens: double, Albumin: double, Albumin_and_Globulin_Ratio: double, Dataset: int]

In [None]:
# Print the leading rows as a text table to eyeball the raw values.
data.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                      0.74|      1|
| 62|  Male|            7.3|             4.1|                 490|                      60|                        

In [None]:
# Print each column's inferred type and nullability.
data.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Total_Bilirubin: double (nullable = true)
 |-- Direct_Bilirubin: double (nullable = true)
 |-- Alkaline_Phosphotase: integer (nullable = true)
 |-- Alamine_Aminotransferase: integer (nullable = true)
 |-- Aspartate_Aminotransferase: integer (nullable = true)
 |-- Total_Protiens: double (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Albumin_and_Globulin_Ratio: double (nullable = true)
 |-- Dataset: integer (nullable = true)



In [None]:
# Number of rows in the loaded dataset.
data.count()

583

In [None]:
# Summary statistics per column. A per-column "count" that is lower than the
# total row count points at missing values in that column.
data.describe().show()

+-------+------------------+------+-----------------+------------------+--------------------+------------------------+--------------------------+------------------+-----------------+--------------------------+------------------+
|summary|               Age|Gender|  Total_Bilirubin|  Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|    Total_Protiens|          Albumin|Albumin_and_Globulin_Ratio|           Dataset|
+-------+------------------+------+-----------------+------------------+--------------------+------------------------+--------------------------+------------------+-----------------+--------------------------+------------------+
|  count|               583|   583|              583|               583|                 583|                     583|                       583|               583|              583|                       579|               583|
|   mean| 44.74614065180103|  null|3.298799313893652|1.4861063464837074|  290.576329

In [None]:
from pyspark.sql.functions import col, mean, isnan, when
from pyspark.sql.types import DoubleType

# Compute the mean of every column that contains at least one null.
#
# The null check covers all columns in ONE aggregation instead of launching a
# separate count() job per column: the mean of the 0/1 "is null" indicator is
# the fraction of nulls, so any truthy (non-zero) fraction marks an incomplete column.
null_fractions = data.select(
    [mean(col(c).isNull().cast("int")).alias(c) for c in data.columns]
).first().asDict()
# A fraction is None when the DataFrame has no rows and 0.0 when the column is complete.
columns_with_nulls = [c for c in data.columns if null_fractions[c]]

# Guard the empty case: with no incomplete columns there is nothing to aggregate.
if columns_with_nulls:
    mean_values = data.select(
        [mean(col(c)).alias(c) for c in columns_with_nulls]
    ).first().asDict()
else:
    mean_values = {}

# Mapping of column name -> mean, displayed as the cell's output.
mean_values

{'Albumin_and_Globulin_Ratio': 0.9470639032815201}

In [None]:
# Impute missing values: wherever a column listed in mean_values is null,
# substitute that column's mean; every other value is kept as-is.
filled_data = data
for name, fill_value in mean_values.items():
    current = col(name)
    imputed = when(current.isNotNull(), current).otherwise(fill_value)
    filled_data = filled_data.withColumn(name, imputed)

In [None]:
# Replace NaN values with null in the floating-point columns.
# isnan() is only meaningful for float/double data, so integer and string columns
# (e.g. Gender) are skipped instead of being pushed through an implicit cast.
# NOTE(review): this handles NaN only; +/-infinity values are left unchanged.
# NOTE(review): this runs after the mean imputation, so a NaN converted here
# stays null and is not filled -- confirm that is acceptable downstream.
floating_columns = [
    name for name, dtype in filled_data.dtypes if dtype in ("float", "double")
]
for feature in floating_columns:
    filled_data = filled_data.withColumn(
        feature, when(isnan(col(feature)), None).otherwise(col(feature))
    )

In [None]:
# Inspect the leading rows after the cleaning steps.
filled_data.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                      0.74|      1|
| 62|  Male|            7.3|             4.1|                 490|                      60|                        

In [None]:
# Encode the categorical "Gender" column as a numeric "GenderIndex" column.
gender_indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")

# This cell rebinds filled_data, so on a second run the frame would already carry
# GenderIndex and the indexer would reject it because its output column exists.
# Dropping the column first makes the cell safe to re-run; drop() is a no-op
# when the column is not present.
indexer_input = filled_data.drop("GenderIndex")
filled_data = gender_indexer.fit(indexer_input).transform(indexer_input)
filled_data.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+-----------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|GenderIndex|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+-----------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|        1.0|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                      0.74|      1|        0.0|
| 62|  Male|            7.3|             4.1|          

In [None]:
# Pack the numeric predictors (plus the indexed gender) into a single vector
# column named "features". The order below is the order inside the vector.
feature_columns = [
    "Age",
    "Total_Bilirubin",
    "Direct_Bilirubin",
    "Alkaline_Phosphotase",
    "Alamine_Aminotransferase",
    "Aspartate_Aminotransferase",
    "Total_Protiens",
    "Albumin",
    "Albumin_and_Globulin_Ratio",
    "GenderIndex",
]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
# NOTE: from here on `data` refers to the assembled frame, not the raw CSV.
data = assembler.transform(filled_data)

data.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+-----------+--------------------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|GenderIndex|            features|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+-----------+--------------------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|        1.0|[65.0,0.7,0.1,187...|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                   

In [None]:
# Keep just what the model needs: the assembled feature vector and the label column.
model_columns = ["features", "Dataset"]
data = data.select(*model_columns)
data.show()

+--------------------+-------+
|            features|Dataset|
+--------------------+-------+
|[65.0,0.7,0.1,187...|      1|
|[62.0,10.9,5.5,69...|      1|
|[62.0,7.3,4.1,490...|      1|
|[58.0,1.0,0.4,182...|      1|
|[72.0,3.9,2.0,195...|      1|
|[46.0,1.8,0.7,208...|      1|
|[26.0,0.9,0.2,154...|      1|
|[29.0,0.9,0.3,202...|      1|
|[17.0,0.9,0.3,202...|      2|
|[55.0,0.7,0.2,290...|      1|
|[57.0,0.6,0.1,210...|      1|
|[72.0,2.7,1.3,260...|      1|
|[64.0,0.9,0.3,310...|      2|
|[74.0,1.1,0.4,214...|      1|
|[61.0,0.7,0.2,145...|      1|
|[25.0,0.6,0.1,183...|      2|
|[38.0,1.8,0.8,342...|      1|
|[33.0,1.6,0.5,165...|      2|
|[40.0,0.9,0.3,293...|      1|
|[40.0,0.9,0.3,293...|      1|
+--------------------+-------+
only showing top 20 rows



In [None]:
# Hold out roughly 20% of the rows for testing and train on the remaining ~80%.
# The fixed seed makes the random assignment repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=42)

In [None]:
# Configure the logistic-regression estimator: it reads the assembled vector from
# "features" (spelled out here; it is also the default) and the label from "Dataset".
# NOTE(review): the recorded output shows labels 1 and 2 rather than 0 and 1 --
# confirm the model treats this as the intended two-class problem.
logreg = LogisticRegression(
    featuresCol="features",
    labelCol="Dataset",
)

In [None]:
# Train the classifier on the training split.
model = logreg.fit(train_data)

# Score the held-out rows; the result carries the model's output columns,
# including "prediction", alongside the input columns.
predictions = model.transform(test_data)

# Display each test row's predicted label next to its feature vector.
predicted_view = predictions.select(["prediction", "features"])
predicted_view.show()

+----------+--------------------+
|prediction|            features|
+----------+--------------------+
|       2.0|[6.0,0.6,0.1,289....|
|       1.0|[10.0,0.8,0.1,395...|
|       1.0|[12.0,0.8,0.2,302...|
|       1.0|[13.0,1.5,0.5,575...|
|       1.0|[16.0,7.7,4.1,268...|
|       2.0|[17.0,0.9,0.2,279...|
|       2.0|[18.0,0.8,0.2,228...|
|       2.0|[18.0,1.8,0.7,178...|
|       2.0|[21.0,1.0,0.3,142...|
|       1.0|[21.0,3.9,1.8,150...|
|       1.0|[21.0,18.5,9.5,38...|
|       2.0|[22.0,0.8,0.2,198...|
|       2.0|[22.0,0.9,0.3,179...|
|       1.0|[22.0,2.7,1.0,160...|
|       2.0|[24.0,0.9,0.2,195...|
|       2.0|[25.0,0.9,0.3,159...|
|       1.0|[26.0,1.7,0.6,210...|
|       2.0|[29.0,0.8,0.2,156...|
|       1.0|[30.0,1.3,0.4,482...|
|       1.0|[31.0,0.6,0.1,175...|
+----------+--------------------+
only showing top 20 rows

