# 
# End-to-End Machine Learning on Big Data (Uçtan Uca Büyük Veride Makine Öğrenmesi)

In [1]:
import findspark as fs

# Point findspark at the local Spark installation so that `pyspark` can be imported.
# The path is a raw string because "\s" is not a valid escape sequence; a plain
# "C:\spark" literal triggers an invalid-escape warning on recent Python versions.
# NOTE(review): machine-specific path -- adjust it to your own Spark install location.
fs.init(r"C:\spark")

In [2]:
import matplotlib.pyplot as splt     # Getting data visualization libraries
import seaborn as sns

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf          # Generating Spark Session to make process
from pyspark import SparkContext


# Build (or reuse) a local SparkSession for this notebook.
# Mind the spelling of "spark.executor.memory": Spark does not validate config
# keys, so a misspelled key is accepted but has no effect.
# The value uses the documented JVM-style size format (e.g. "512m", "16g").
spark = (
    SparkSession.builder
    .master("local")
    .appName("machine_learning_on_spark")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext and display it as the cell output.
start_spark = spark.sparkContext
start_spark


# 

In [12]:
# Load the churn dataset from CSV. The first line supplies the column names and
# inferSchema asks Spark to detect each column's type instead of reading strings.
# cache() marks the DataFrame for reuse and returns that same DataFrame.
spark_df = spark.read.csv(
    path="churn.csv",
    sep=",",
    header=True,
    inferSchema=True,
).cache()

# Display the cached DataFrame's column/type summary as the cell output.
spark_df

DataFrame[_c0: int, Names: string, Age: double, Total_Purchase: double, Account_Manager: int, Years: double, Num_Sites: double, Churn: int]

In [13]:
spark_df.printSchema()    # Print each column's name, inferred data type and nullability

root
 |-- _c0: integer (nullable = true)
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)



In [14]:
spark_df.show()    # Preview the first rows of the dataset (show() prints at most 20 rows by default)

+---+-------------------+----+--------------+---------------+-----+---------+-----+
|_c0|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+---+-------------------+----+--------------+---------------+-----+---------+-----+
|  0|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|     Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
|  5|   Jessica Williams|48.0|      10356.02|              0| 5.12|      8.0|    1|
|  6|        Eric Butler|44.0|      11331.58|              1| 5.23|     11.0|    1|
|  7|      Zachary Walsh|32.0|       9885.12|              1| 6.92|      9.0|    1|
|  8|        Ashlee Carr|43.0|       14062.6|              1| 5.46|     11.0

# 

In [15]:
# Normalise every column name to lowercase; toDF() returns a new DataFrame
# carrying the given names in the same column order.
spark_df = spark_df.toDF(*map(str.lower, spark_df.columns))

In [16]:
spark_df.show(5)    # Preview five rows to confirm the column names are now lowercase

+---+----------------+----+--------------+---------------+-----+---------+-----+
|_c0|           names| age|total_purchase|account_manager|years|num_sites|churn|
+---+----------------+----+--------------+---------------+-----+---------+-----+
|  0|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|  Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
+---+----------------+----+--------------+---------------+-----+---------+-----+
only showing top 5 rows



In [17]:
# df.columns = map(str.lower, df.columns)   # Another way to lowercase the column names (pandas-style; presumably meant for a pandas DataFrame named `df`)

# 

In [18]:
spark_df = spark_df.withColumnRenamed("_c0","index")    # Rename the first column from `_c0` to the more descriptive `index`

spark_df.show(4)    # Preview four rows to confirm the rename

+-----+----------------+----+--------------+---------------+-----+---------+-----+
|index|           names| age|total_purchase|account_manager|years|num_sites|churn|
+-----+----------------+----+--------------+---------------+-----+---------+-----+
|    0|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|    1|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|    2|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|    3|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
+-----+----------------+----+--------------+---------------+-----+---------+-----+
only showing top 4 rows



In [19]:
spark_df.count()    # Total number of rows in the dataset

900

In [20]:
spark_df.columns    # List of column names in the dataset

['index',
 'names',
 'age',
 'total_purchase',
 'account_manager',
 'years',
 'num_sites',
 'churn']

In [22]:
spark_df.distinct().count()     # Number of distinct rows; if it equals the total row count, no row is fully duplicated

900

In [23]:
spark_df.select("names").distinct().count()    # Number of distinct values in the `names` column; fewer than the row count means some name repeats

899

In [25]:
# Count how many rows carry each name and list the most frequent names first,
# to find out which name occurs more than once.
(
    spark_df
    .groupBy("names")
    .count()
    .orderBy("count", ascending=False)
    .show(5)
)

# "Jennifer Wood" appears twice. Next, check whether those two rows describe the same person.

+----------------+-----+
|           names|count|
+----------------+-----+
|   Jennifer Wood|    2|
|   David Compton|    1|
|Patrick Robinson|    1|
|   Chelsea Marsh|    1|
|     John Barber|    1|
+----------------+-----+
only showing top 5 rows



In [26]:
# Inspect both "Jennifer Wood" rows: every other column differs, so these are
# two different customers who happen to share a name.
spark_df.where(spark_df["names"] == "Jennifer Wood").show()

+-----+-------------+----+--------------+---------------+-----+---------+-----+
|index|        names| age|total_purchase|account_manager|years|num_sites|churn|
+-----+-------------+----+--------------+---------------+-----+---------+-----+
|   22|Jennifer Wood|35.0|       9381.12|              1| 6.78|     11.0|    1|
|  439|Jennifer Wood|48.0|      11585.16|              0| 4.61|      9.0|    0|
+-----+-------------+----+--------------+---------------+-----+---------+-----+

