<a href="https://colab.research.google.com/github/EvelynChrisyla/2.DataAnalytics-With-Logistic-Regression-Classification-Using-Spark.ipynb/blob/main/DataAnalytics_(Logistic_Regression_Classification)_Using_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [169]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import*
from pyspark.ml.feature import StandardScaler, VectorAssembler #for normalization, vector assembler to make array

spark = SparkSession.builder.getOrCreate()



In [170]:
#Import / Load Data from CSV
df_train= spark.read.option("inferSchema", "true").csv("Classification_Train.csv", header=True)
df_test = spark.read.option("inferSchema", "true").csv("Classification_Test.csv", header = True)

print("df_train")
df_train.show()

print("df_test")
df_test.show()

df_train
+-------------------+------+------+---------------+---------+-------+-------------+---------+
|               Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+-------------------+------+------+---------------+---------+-------+-------------+---------+
|      Sax Tesseyman|Female|   174|   Intermediate|     Blue|    Yes|     85000000|       No|
|        Niels Greet|  Male|   165|   Intermediate|    Black|     No|     14000000|       No|
|     Minetta Santry|Female|   160|            Low|    Black|     No|    148000000|      Yes|
|     Sherm Gossipin|Female|   144|           High|    Black|     No|     50000000|      Yes|
|   Cathie Blackmuir|  Male|   168|   Intermediate|    Black|    Yes|    101000000|       No|
|     Early Cardenas|  Male|   151|            Low|    Black|    Yes|    145000000|      Yes|
|   Willard Pendrick|Female|   141|   Intermediate|    Brown|     No|     55000000|      Yes|
|   Penelopa Spensly|Female|   144|   Intermediate|

In [171]:
#Select Important Data
df_train= df_train.select("Gender", "Education Level", "Married","Salary Income", "Depressed")

df_test= df_test.select("Gender", "Education Level", "Married","Salary Income", "Depressed")

df_train.show()
df_test.show()

+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------+---------+
|Female|   Intermediate|    Yes|     85000000|       No|
|  Male|   Intermediate|     No|     14000000|       No|
|Female|            Low|     No|    148000000|      Yes|
|Female|           High|     No|     50000000|      Yes|
|  Male|   Intermediate|    Yes|    101000000|       No|
|  Male|            Low|    Yes|    145000000|      Yes|
|Female|   Intermediate|     No|     55000000|      Yes|
|Female|   Intermediate|    Yes|     51000000|       No|
|Female|           High|     No|     97000000|      Yes|
|  Male|            Low|     No|     41000000|      Yes|
|Female|           High|    Yes|     27000000|       No|
|  Male|           High|    Yes|      3000000|       No|
|Female|           High|     No|      9000000|      Yes|
|Female|   Intermediate|     No|     12000000|      Yes|
|  Male|   Intermediate|    Yes

# **DATA CLEANSING**

In [172]:
# If any data have NA
df_train = df_train.na.drop()
df_test = df_test.na.drop()

In [173]:
#Transform string to num/int

#df training
df_train = df_train.withColumn("Education Level", when(df_train["Education Level"] == "Low",0)
                                                  .when(df_train["Education Level"] == "Intermediate",1)
                                                  .when(df_train["Education Level"] == "High",2))

df_train = df_train.withColumn("Married", when(df_train["Married"] == "No",0)
                                          .when(df_train["Married"] == "Yes",1))

df_train = df_train.withColumn("Gender", when(df_train["Gender"] == "Male",0)
                                        .when(df_train["Gender"] == "Female",1))


df_train = df_train.withColumn("Depressed", when(df_train["Depressed"] == "No",0)
                                        .when(df_train["Depressed"] == "Yes",1))


#df testing
df_test = df_test.withColumn("Education Level", when(df_test["Education Level"] == "Low",0)
                                                  .when(df_test["Education Level"] == "Intermediate",1)
                                                  .when(df_test["Education Level"] == "High",2))

df_test = df_test.withColumn("Married", when(df_test["Married"] == "No",0)
                                          .when(df_test["Married"] == "Yes",1))

df_test = df_test.withColumn("Gender", when(df_test["Gender"] == "Male",0)
                                        .when(df_test["Gender"] == "Female",1))


df_test = df_test.withColumn("Depressed", when(df_test["Depressed"] == "No",0)
                                        .when(df_test["Depressed"] == "Yes",1))

df_train.show(20)
df_test.show(20)

+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------+---------+
|     1|              1|      1|     85000000|        0|
|     0|              1|      0|     14000000|        0|
|     1|              0|      0|    148000000|        1|
|     1|              2|      0|     50000000|        1|
|     0|              1|      1|    101000000|        0|
|     0|              0|      1|    145000000|        1|
|     1|              1|      0|     55000000|        1|
|     1|              1|      1|     51000000|        0|
|     1|              2|      0|     97000000|        1|
|     0|              0|      0|     41000000|        1|
|     1|              2|      1|     27000000|        0|
|     0|              2|      1|      3000000|        0|
|     1|              2|      0|      9000000|        1|
|     1|              1|      0|     12000000|        1|
|     0|              1|      1

In [174]:
#Normalization

trainColumns = df_train.columns
trainColumns.remove("Depressed")
# print(trainColumns)

df_train = VectorAssembler(inputCols= trainColumns, outputCol ="Features").transform(df_train)
df_train = StandardScaler(inputCol="Features", outputCol ="NormalizedFeature").fit(df_train).transform(df_train)
df_train.show(5)


trainColumns = df_test.columns
trainColumns.remove("Depressed")

df_test = VectorAssembler(inputCols= trainColumns, outputCol ="Features").transform(df_test)
df_test = StandardScaler(inputCol="Features", outputCol ="NormalizedFeature").fit(df_test).transform(df_test)
df_test.show(5)

+------+---------------+-------+-------------+---------+--------------------+--------------------+
|Gender|Education Level|Married|Salary Income|Depressed|            Features|   NormalizedFeature|
+------+---------------+-------+-------------+---------+--------------------+--------------------+
|     1|              1|      1|     85000000|        0| [1.0,1.0,1.0,8.5E7]|[1.99995775711396...|
|     0|              1|      0|     14000000|        0| [0.0,1.0,0.0,1.4E7]|[0.0,1.2959574236...|
|     1|              0|      0|    148000000|        1|[1.0,0.0,0.0,1.48E8]|[1.99995775711396...|
|     1|              2|      0|     50000000|        1| [1.0,2.0,0.0,5.0E7]|[1.99995775711396...|
|     0|              1|      1|    101000000|        0|[0.0,1.0,1.0,1.01E8]|[0.0,1.2959574236...|
+------+---------------+-------+-------------+---------+--------------------+--------------------+
only showing top 5 rows

+------+---------------+-------+-------------+---------+--------------------+-------

In [175]:
df_train.select("NormalizedFeature").show(10,False)

+-----------------------------------------------------------------------------+
|NormalizedFeature                                                            |
+-----------------------------------------------------------------------------+
|[1.9999577571139613,1.2959574236207398,2.0004291810120303,2.0520111267842664]|
|[0.0,1.2959574236207398,0.0,0.3379783032350556]                              |
|[1.9999577571139613,0.0,0.0,3.5729134913420166]                              |
|[1.9999577571139613,2.5919148472414797,0.0,1.2070653686966273]               |
|[0.0,1.2959574236207398,2.0004291810120303,2.438272044767187]                |
|[0.0,0.0,2.0004291810120303,3.500489569220219]                               |
|[1.9999577571139613,1.2959574236207398,0.0,1.32777190556629]                 |
|[1.9999577571139613,1.2959574236207398,2.0004291810120303,1.2312066760705598]|
|[1.9999577571139613,2.5919148472414797,0.0,2.341706815271457]                |
|(4,[3],[0.9897936023312343])           

In [176]:
from pyspark.ml.classification import LogisticRegression #for prediction

In [177]:
#Create Model Classification
#(df_train) to identify patterns or relationships between these factors and depression
model = LogisticRegression(featuresCol="NormalizedFeature", labelCol="Depressed", maxIter =1000).fit(df_train)


predictResult = model.transform(df_test)

In [178]:
predictResult.select("Depressed","prediction").show(20)

+---------+----------+
|Depressed|prediction|
+---------+----------+
|        1|       1.0|
|        0|       0.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        1|       0.0|
|        0|       0.0|
|        0|       1.0|
|        0|       0.0|
|        0|       0.0|
|        0|       1.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
+---------+----------+
only showing top 20 rows



In [179]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator # to find accuracy

In [181]:
# #The predicted results are compared with the actual
# labels that have been recorded in the “Depressed” column
# to evaluate the model performance
evaluator = BinaryClassificationEvaluator(labelCol ="Depressed")
accuracy = evaluator.evaluate(predictResult)

print(f"Accuracy: {accuracy * 100}%")

Accuracy: 87.06413015098448%
