In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('IMDF')
    .getOrCreate()
)

In [3]:
# Load the incident event log: the first CSV row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    path='incident_event_log.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the inferred column names and types of the loaded frame.
df.printSchema()

root
 |-- number: string (nullable = true)
 |-- incident_state: string (nullable = true)
 |-- active: boolean (nullable = true)
 |-- reassignment_count: integer (nullable = true)
 |-- reopen_count: integer (nullable = true)
 |-- sys_mod_count: integer (nullable = true)
 |-- made_sla: boolean (nullable = true)
 |-- caller_id: string (nullable = true)
 |-- opened_by: string (nullable = true)
 |-- opened_at: string (nullable = true)
 |-- sys_created_by: string (nullable = true)
 |-- sys_created_at: string (nullable = true)
 |-- sys_updated_by: string (nullable = true)
 |-- sys_updated_at: string (nullable = true)
 |-- contact_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- category: string (nullable = true)
 |-- subcategory: string (nullable = true)
 |-- u_symptom: string (nullable = true)
 |-- cmdb_ci: string (nullable = true)
 |-- impact: string (nullable = true)
 |-- urgency: string (nullable = true)
 |-- priority: string (nullable = true)
 |-- assignment_gr

In [5]:
from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

In [6]:
# Every *_at column is a string in day-first 'dd/MM/yyyy HH:mm' layout.
_at_format = 'dd/MM/yyyy HH:mm'
_event_stems = ['resolved', 'opened', 'sys_created', 'sys_updated', 'closed']

# <stem>_ts: the full timestamp parsed from <stem>_at.
for _stem in _event_stems:
    df = df.withColumn(_stem + '_ts', to_timestamp(df[_stem + '_at'], _at_format))

# <stem>: the calendar date only, parsed from the same <stem>_at string.
for _stem in _event_stems:
    df = df.withColumn(_stem, to_date(df[_stem + '_at'], _at_format))

# duration: whole days between the opened date and the resolved date.
df = df.withColumn(
    'duration',
    datediff(
        to_date(df['resolved_at'], _at_format),
        to_date(df['opened_at'], _at_format),
    ),
)

In [7]:
# One row per incident number: the 'Closed' log entry with the highest
# sys_mod_count.
#
# sort() followed by dropDuplicates() does NOT guarantee this: dropDuplicates()
# shuffles the data and keeps an arbitrary row per key, so the preceding sort
# is not honoured. Ranking rows inside a per-incident window makes the choice
# explicit. Rows tied on sys_mod_count are still picked arbitrarily among
# themselves.
df_unique_incidents = (
    df.filter("incident_state=='Closed'")
    .withColumn(
        "_mod_rank",
        F.row_number().over(
            Window.partitionBy("number").orderBy(F.col("sys_mod_count").desc())
        ),
    )
    .filter(F.col("_mod_rank") == 1)
    # Drop the helper column so the schema matches the input frame.
    .drop("_mod_rank")
)

#### 1. Top 5 people with most resolved incidents

In [8]:
# Number of unique closed incidents per resolver, largest count first.
A1 = (
    df_unique_incidents
    .groupBy("resolved_by")
    .count()
    .orderBy("count", ascending=False)
)

In [9]:
# Print the first five rows of A1 (sorted by count, descending).
A1.show(n=5)

+---------------+-----+
|    resolved_by|count|
+---------------+-----+
| Resolved by 11| 3071|
| Resolved by 15| 2415|
|Resolved by 103|  689|
|Resolved by 177|  686|
| Resolved by 32|  597|
+---------------+-----+
only showing top 5 rows



#### 2. Based on least average duration, find the top 5 people with the maximum number of incidents resolved

In [10]:
from pyspark.sql import functions as F

In [11]:
# Per resolver: how many incidents have a duration, and the mean duration in days.
# Ordered by shortest average duration first; ties go to whoever resolved more.
# NOTE(review): because average duration is the primary key, resolvers with only
# a handful of same-day incidents rank first - confirm this is the intended reading.
A2 = (
    df_unique_incidents
    .groupBy("resolved_by")
    .agg(
        F.count("duration").alias("Incidents Resolved"),
        F.mean("duration").alias("Average Duration"),
    )
    .orderBy(
        F.col("Average Duration").asc(),
        F.col("Incidents Resolved").desc(),
    )
)

In [12]:
# Print the first five rows of A2 (lowest average duration first).
A2.show(n=5)

+---------------+------------------+----------------+
|    resolved_by|Incidents Resolved|Average Duration|
+---------------+------------------+----------------+
| Resolved by 10|                 4|             0.0|
| Resolved by 94|                 4|             0.0|
| Resolved by 26|                 2|             0.0|
|Resolved by 145|                 2|             0.0|
| Resolved by 39|                 1|             0.0|
+---------------+------------------+----------------+
only showing top 5 rows



#### 3. People with maximum number of high impact incidents resolved

In [13]:
# Incident count per (impact, resolver) pair, ordered by impact label ascending
# and then by count descending. The impact labels are strings, so the order is
# lexical; '1 - High' rows come first only because of the numeric prefix.
A3 = (
    df_unique_incidents
    .groupBy("impact", "resolved_by")
    .count()
    .orderBy(F.col("impact").asc(), F.col("count").desc())
)

In [14]:
# Print the first ten rows of A3 (impact ascending, then count descending).
A3.show(n=10)

+--------+---------------+-----+
|  impact|    resolved_by|count|
+--------+---------------+-----+
|1 - High| Resolved by 98|   20|
|1 - High|Resolved by 137|   17|
|1 - High| Resolved by 11|   15|
|1 - High|Resolved by 165|   13|
|1 - High|Resolved by 111|   12|
|1 - High|  Resolved by 6|   12|
|1 - High|Resolved by 223|   10|
|1 - High|Resolved by 139|    9|
|1 - High|Resolved by 150|    9|
|1 - High| Resolved by 91|    9|
+--------+---------------+-----+
only showing top 10 rows



#### 4a. For each impact level, find the person with the most incidents resolved

In [15]:
# Top resolver for each impact level (columns: impact, resolved_by, count).
#
# sort() followed by dropDuplicates() does not reliably keep the highest-count
# row: dropDuplicates() shuffles and keeps an arbitrary row per key. Rank the
# resolvers inside each impact level instead and keep rank 1. Equal counts are
# broken by resolved_by so repeated runs return the same person.
A4a = (
    df_unique_incidents
    .groupBy("impact", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("impact").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")
    # Sort last so the displayed rows come out in impact order.
    .orderBy("impact")
)

In [16]:
# Print A4a: one row per impact level.
A4a.show()

+----------+--------------+-----+
|    impact|   resolved_by|count|
+----------+--------------+-----+
|  1 - High|Resolved by 98|   20|
|2 - Medium|Resolved by 11| 3045|
|   3 - Low|Resolved by 66|  194|
+----------+--------------+-----+



#### 4b. For each urgency level, find the person with the most incidents resolved

In [17]:
# Top resolver for each urgency level (columns: urgency, resolved_by, count).
#
# sort() followed by dropDuplicates() does not reliably keep the highest-count
# row: dropDuplicates() shuffles and keeps an arbitrary row per key. Rank the
# resolvers inside each urgency level instead and keep rank 1. Equal counts are
# broken by resolved_by so repeated runs return the same person.
A4b = (
    df_unique_incidents
    .groupBy("urgency", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("urgency").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")
    # Sort last so the displayed rows come out in urgency order.
    .orderBy("urgency")
)

In [18]:
# Print A4b: one row per urgency level.
A4b.show()

+----------+---------------+-----+
|   urgency|    resolved_by|count|
+----------+---------------+-----+
|  1 - High|Resolved by 166|   38|
|2 - Medium| Resolved by 11| 3047|
|   3 - Low| Resolved by 66|  195|
+----------+---------------+-----+



#### 4c. For each priority level, find the person with the most incidents resolved

In [19]:
# Top resolver for each priority level (columns: priority, resolved_by, count).
#
# sort() followed by dropDuplicates() does not reliably keep the highest-count
# row, nor the sorted output order: dropDuplicates() shuffles and keeps an
# arbitrary row per key. Rank the resolvers inside each priority level instead
# and keep rank 1. Equal counts are broken by resolved_by so repeated runs
# return the same person.
A4c = (
    df_unique_incidents
    .groupBy("priority", "resolved_by")
    .count()
    .withColumn(
        "_rank",
        F.row_number().over(
            Window.partitionBy("priority").orderBy(
                F.col("count").desc(), F.col("resolved_by").asc()
            )
        ),
    )
    .filter(F.col("_rank") == 1)
    .drop("_rank")
    # Sort last so the displayed rows come out in priority order.
    .orderBy("priority")
)

In [20]:
# Print A4c: one row per priority level.
A4c.show()

+------------+---------------+-----+
|    priority|    resolved_by|count|
+------------+---------------+-----+
|1 - Critical| Resolved by 98|   16|
|3 - Moderate| Resolved by 11| 3040|
|     4 - Low| Resolved by 66|  195|
|    2 - High|Resolved by 166|   40|
+------------+---------------+-----+



#### 5. Find each contact type as a percentage of total incidents

In [21]:
from pyspark.sql.window import Window

In [22]:
# Incident count per contact type, plus each type's share of all incidents.
# An empty partitionBy() makes the window span every row, so the windowed sum
# is the grand total of the per-type counts.
_all_rows = Window.partitionBy()
A5 = (
    df_unique_incidents
    .groupBy("contact_type")
    .count()
    .withColumn(
        "percentage",
        F.round(F.col("count") * 100 / F.sum("count").over(_all_rows), 2),
    )
)

In [23]:
# Print A5: count and percentage for every contact type.
A5.show()

+--------------+-----+----------+
|  contact_type|count|percentage|
+--------------+-----+----------+
|         Phone|24688|     99.08|
|         Email|   59|      0.24|
|  Self service|  158|      0.63|
|           IVR|    9|      0.04|
|Direct opening|    4|      0.02|
+--------------+-----+----------+



#### 6. On each priority level, find the percentage of incidents which made SLA and which did not.

In [24]:
# For each priority, count incidents by made_sla value and express each count
# as a percentage of that priority's total. Despite its name, "Made SLA %" on a
# made_sla=false row is the share that missed the SLA.
_within_priority = Window.partitionBy("priority")
A6 = (
    df_unique_incidents
    .groupBy("priority", "made_sla")
    .count()
    .withColumnRenamed("count", "Population")
    .withColumn(
        "Made SLA %",
        F.round(
            F.col("Population") * 100 / F.sum("Population").over(_within_priority),
            2,
        ),
    )
    .orderBy(F.col("priority").asc(), F.col("made_sla").desc())
)

In [25]:
# Print A6: per-priority SLA breakdown (priority ascending, made_sla=true first).
A6.show()

+------------+--------+----------+----------+
|    priority|made_sla|Population|Made SLA %|
+------------+--------+----------+----------+
|1 - Critical|    true|         5|      1.85|
|1 - Critical|   false|       265|     98.15|
|    2 - High|    true|         2|      0.49|
|    2 - High|   false|       406|     99.51|
|3 - Moderate|    true|     15145|     64.54|
|3 - Moderate|   false|      8321|     35.46|
|     4 - Low|    true|       651|     84.11|
|     4 - Low|   false|       123|     15.89|
+------------+--------+----------+----------+



#### 7. Top 5 locations with the maximum number of incidents reported

In [26]:
# Number of incidents reported at each location, busiest location first.
A7 = (
    df_unique_incidents
    .groupBy("location")
    .agg(F.count("number").alias("Incidents Reported"))
    .orderBy(F.col("Incidents Reported").desc())
)

In [27]:
# Print the first five rows of A7 (most incidents first).
A7.show(5)

+------------+------------------+
|    location|Incidents Reported|
+------------+------------------+
|Location 204|              5554|
|Location 161|              4002|
|Location 143|              3276|
|Location 108|              2140|
| Location 93|              1934|
+------------+------------------+
only showing top 5 rows



#### 8. Which category of issues missed meeting the SLA the most?

In [28]:
# Keep only incidents that missed the SLA, then count them per category,
# worst category first.
_missed_sla = df_unique_incidents.filter("made_sla==false")
A8 = (
    _missed_sla
    .groupBy("category")
    .agg(F.count("made_sla").alias("Incidents failed to make SLA"))
    .orderBy(F.col("Incidents failed to make SLA").desc())
)

In [29]:
# Print the first five rows of A8 (most SLA misses first).
A8.show(5)

+-----------+----------------------------+
|   category|Incidents failed to make SLA|
+-----------+----------------------------+
|Category 46|                        1254|
|Category 26|                        1017|
|Category 53|                        1009|
|Category 42|                         689|
|Category 23|                         505|
+-----------+----------------------------+
only showing top 5 rows

