In [None]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("diabetes")
    .getOrCreate()
)


In [None]:
# Load the diabetes CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "diabetes.csv",
    header=True,
    inferSchema=True,
)
# Print the first two rows as a quick check that the load worked.
df.show(2)

In [None]:
# Print the column names together with the types Spark inferred for them.
df.printSchema()

In [None]:
# Summary statistics from DataFrame.describe(), converted to pandas so the
# notebook renders them as a table.
df.describe().toPandas()

In [None]:
# Number of rows for each distinct value of the Outcome column.
outcome_counts = df.groupBy("Outcome").count()
outcome_counts.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Collect the Spark DataFrame to pandas exactly once. Calling df.toPandas()
# inside the loop re-ran a full Spark collect for every subplot.
features_pdf = df.toPandas()
# pandas' describe() reports numeric columns by default, which are the ones
# that can be drawn as histograms.
feature_names = list(features_pdf.describe().columns)

# Four plots per row on at least a 3x4 grid; add rows if there are more
# features than the grid can hold instead of silently dropping them.
GRID_COLS = 4
grid_rows = max(3, -(-len(feature_names) // GRID_COLS))  # ceiling division

fig = plt.figure(figsize=(25, 15))
st = fig.suptitle("Distribution of Features", fontsize=50, verticalalignment="center")
# The loop variable is deliberately not named `col`: that name is imported from
# pyspark.sql.functions further down, and re-running this cell would overwrite
# the function with a plain string.
for position, feature in enumerate(feature_names, start=1):
    ax = fig.add_subplot(grid_rows, GRID_COLS, position)
    ax.hist(features_pdf[feature])
    ax.grid(False)
    ax.tick_params(axis="x", labelrotation=45, labelsize=20)
    ax.tick_params(axis="y", labelsize=15)
    ax.set_title(feature.upper(), fontsize=20)

plt.tight_layout()
# After tight_layout, move the figure title up and reserve head-room for it.
st.set_y(0.95)
fig.subplots_adjust(top=0.85, hspace=0.4)
plt.show()

In [None]:
from pyspark.sql.functions import isnan, when, count, col

In [None]:
# Count missing values per column. Blank CSV fields are loaded as null rather
# than NaN, and isnan() is false for null, so both conditions are checked.
# The literal 1 gives count() a non-null value to tally for every match.
missing_counts = df.select(
    [count(when(isnan(c) | df[c].isNull(), 1)).alias(c) for c in df.columns]
)
missing_counts.toPandas().head()

# User Defined Functions (UDF)


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [None]:
def _outcome_label(outcome):
    """Map an Outcome of 0 to "no" and any other value to "yes".

    A missing outcome (None) stays missing instead of being labelled "yes".
    """
    if outcome is None:
        # `None == 0` is False, so without this guard an unknown outcome would
        # be reported as diabetic.
        return None
    return "no" if outcome == 0 else "yes"


# Spark UDF wrapper; the label is returned as a string column.
y_udf = udf(_outcome_label, StringType())

In [None]:
# Replace the numeric Outcome column with its "yes"/"no" label. The guard keeps
# this cell re-runnable: once Outcome has been dropped, running the cell a
# second time would otherwise fail trying to read it.
if "HasDiabetes" not in df.columns:
    df = df.withColumn("HasDiabetes", y_udf("Outcome")).drop("Outcome")


In [None]:
# Show the first three rows to check the new HasDiabetes column.
df.show(3)

In [None]:
def udf_multi(age):
  """Bucket an age into a labelled age group.

  Args:
    age: Age as a number, or None / NaN when it is unknown.

  Returns:
    One of "Under 25", "Between 25 and 35", "Between 36 and 50", "Over 50",
    or "NA" when the age is missing.
  """
  # Comparing None with a number raises TypeError in Python 3, so a missing
  # age has to be handled before the range checks.
  if age is None:
    return "NA"
  if age < 25:
    return "Under 25"
  if age <= 35:
    return "Between 25 and 35"
  # The upper bound is inclusive so that 50 matches its label; previously an
  # age of exactly 50 fell through to "Over 50".
  if age <= 50:
    return "Between 36 and 50"
  if age > 50:
    return "Over 50"
  # Only reached when every comparison above is false, i.e. for NaN.
  return "NA"

In [None]:
# Wrap the Python bucketing function as a Spark UDF. StringType is the return
# type udf() falls back to when none is given; it is spelled out here.
age_udf = udf(udf_multi, returnType=StringType())
# Derive the age-group label from the Age column.
df = df.withColumn("age_udf", age_udf("Age"))

In [None]:
# Show the first two rows to check the new age_udf column.
df.show(2)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import rank, sum, desc
from pyspark.sql import Window

In [None]:
# Window frame spanning every row (no partitioning or ordering), so an
# aggregate evaluated over it yields the grand total on each row.
frame_start = Window.unboundedPreceding
frame_end = Window.unboundedFollowing
window = Window.rowsBetween(frame_start, frame_end)

In [None]:
# Glucose statistics for each age group.
glucose_by_age = (
    df.select("age_udf", "Glucose")
    .groupBy("age_udf")
    .agg(
        F.count("Glucose").alias("UserCount"),
        F.mean("Glucose").alias("Glucose_AVG"),
        F.min("Glucose").alias("Glucose_MIN"),
        F.max("Glucose").alias("Glucose_MAX"),
    )
)

# Add each group's percentage of the overall count. The helper column `total`
# holds the sum of UserCount over the whole result and is dropped afterwards.
age_group_tab = (
    glucose_by_age
    .withColumn("total", F.sum(F.col("UserCount")).over(window))
    .withColumn("Percent", F.col("UserCount") * 100 / F.col("total"))
    .drop("total")
    .orderBy(F.desc("Percent"))
)

In [None]:
# Convert the aggregated table to a pandas DataFrame so the notebook displays it.
age_group_tab.toPandas()

In [None]:

# Bar chart of each age group's percentage. The trailing semicolon keeps the
# Axes repr out of the cell output.
age_group_pdf = age_group_tab.toPandas()
sns.barplot(data=age_group_pdf, x="age_udf", y="Percent");