#### Importing Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

#### Creating SparkSession

In [3]:
# Build (or reuse) a SparkSession for the app "first", running with master "local[*]".
spark = (
    SparkSession.builder
    .appName("first")
    .master("local[*]")
    .getOrCreate()
)

In [6]:
# One string-typed StructField per column of the log: status, technology, date.
schema = StructType(
    [StructField(column, StringType(), False) for column in ('STATUS', 'TECH', 'DATE')]
)

# Read the tab-separated log; the explicit schema supplies column names and types.
data = spark.read.schema(schema).option("sep", '\t').csv("./data/server_log.tsv")

In [10]:
# Preview the loaded log as a text table.
data.show()

+------+--------+----------+
|STATUS|    TECH|      DATE|
+------+--------+----------+
|ERROR:|     php|21/05/2015|
| DONE:|     php|11/01/2016|
|ERROR:|RailsApp|05/08/2015|
|ERROR:|     php|19/05/2015|
| DONE:|   mysql|23/01/2016|
|ERROR:|     php|13/02/2016|
|ERROR:|     php|22/11/2014|
|ERROR:|RailsApp|25/12/2015|
|ERROR:|   mysql|18/03/2015|
| DONE:|     php|22/08/2015|
|ERROR:|RailsApp|06/05/2015|
|ERROR:|     php|09/03/2015|
| DONE:|   mysql|28/06/2015|
+------+--------+----------+



In [18]:
# Keep only the rows whose STATUS column is exactly "ERROR:".
ERROR_DATA = data.where(data["STATUS"] == "ERROR:")

In [19]:
# Preview the rows that passed the STATUS == "ERROR:" filter.
ERROR_DATA.show()

+------+--------+----------+
|STATUS|    TECH|      DATE|
+------+--------+----------+
|ERROR:|     php|21/05/2015|
|ERROR:|RailsApp|05/08/2015|
|ERROR:|     php|19/05/2015|
|ERROR:|     php|13/02/2016|
|ERROR:|     php|22/11/2014|
|ERROR:|RailsApp|25/12/2015|
|ERROR:|   mysql|18/03/2015|
|ERROR:|RailsApp|06/05/2015|
|ERROR:|     php|09/03/2015|
+------+--------+----------+



In [26]:
# Count the error rows per technology and give the count column a readable name.
ERRORS = (
    ERROR_DATA
    .groupBy("TECH")
    .count()
    .withColumnRenamed("count", "No. of Errors")
)

In [27]:
# Display the per-technology error counts.
ERRORS.show()

+--------+-------------+
|    TECH|No. of Errors|
+--------+-------------+
|   mysql|            1|
|RailsApp|            3|
|     php|            5|
+--------+-------------+



#### Same error count using the RDD API

In [7]:
# Get the DataFrame's contents as an RDD of Row objects for the RDD-based steps.
rdd = data.rdd

In [8]:
# Return every row to the driver as a Python list of Row objects.
# NOTE(review): collect() loads the whole RDD into driver memory; that is fine for a
# small sample file, but consider take(n) if the log can grow.
rdd.collect()

[Row(STATUS='ERROR:', TECH='php', DATE='21/05/2015'),
 Row(STATUS='DONE:', TECH='php', DATE='11/01/2016'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='05/08/2015'),
 Row(STATUS='ERROR:', TECH='php', DATE='19/05/2015'),
 Row(STATUS='DONE:', TECH='mysql', DATE='23/01/2016'),
 Row(STATUS='ERROR:', TECH='php', DATE='13/02/2016'),
 Row(STATUS='ERROR:', TECH='php', DATE='22/11/2014'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='25/12/2015'),
 Row(STATUS='ERROR:', TECH='mysql', DATE='18/03/2015'),
 Row(STATUS='DONE:', TECH='php', DATE='22/08/2015'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='06/05/2015'),
 Row(STATUS='ERROR:', TECH='php', DATE='09/03/2015'),
 Row(STATUS='DONE:', TECH='mysql', DATE='28/06/2015')]

In [11]:
# Keep only the Row objects whose STATUS field is exactly "ERROR:".
errorData = rdd.filter(lambda record: record.STATUS == "ERROR:")

In [12]:
# Return the filtered error rows to the driver as a Python list.
# NOTE(review): collect() loads the whole RDD into driver memory; consider take(n)
# if the log can grow.
errorData.collect()

[Row(STATUS='ERROR:', TECH='php', DATE='21/05/2015'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='05/08/2015'),
 Row(STATUS='ERROR:', TECH='php', DATE='19/05/2015'),
 Row(STATUS='ERROR:', TECH='php', DATE='13/02/2016'),
 Row(STATUS='ERROR:', TECH='php', DATE='22/11/2014'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='25/12/2015'),
 Row(STATUS='ERROR:', TECH='mysql', DATE='18/03/2015'),
 Row(STATUS='ERROR:', TECH='RailsApp', DATE='06/05/2015'),
 Row(STATUS='ERROR:', TECH='php', DATE='09/03/2015')]

In [18]:
# Group the error rows by their TECH field; each element becomes (TECH, rows).
grpByError = errorData.groupBy(lambda record: record.TECH)

In [21]:
# Inspect the grouped RDD: one (key, group-of-rows) pair per TECH value.
grpByError.collect()

[('php', <pyspark.resultiterable.ResultIterable at 0x21c32bae880>),
 ('RailsApp', <pyspark.resultiterable.ResultIterable at 0x21c32ca9880>),
 ('mysql', <pyspark.resultiterable.ResultIterable at 0x21c32ca9580>)]

In [22]:
# Turn each (TECH, rows) group into (TECH, number of rows).
# The grouped values support len() directly, so there is no need to copy each
# group into a list first; mapValues applies len to the value and keeps the key.
countErrors = grpByError.mapValues(len)

In [24]:
# Fetch the (TECH, error count) pairs to the driver for display.
countErrors.collect()

[('php', 5), ('RailsApp', 3), ('mysql', 1)]

In [25]:
# Shut down the SparkSession now that the notebook is finished with it.
spark.stop()