In [1]:
import pandas as pd

In [49]:
# Load the raw log once and keep it untouched as `original_file`
original_file = pd.read_csv('log.csv')

# All further work happens on an independent deep copy
file = original_file.copy(deep=True)

In [50]:
# Preview the first three rows of the working copy
file.head(n=3)

Unnamed: 0,trackerId,task,status,server_region,timestamp,server,state,country,datetime
0,89328463487,Data quality assurance,success,Florida;USA,1599093305000,464f:bdbd:1d:535a:2dad:5117:a92f:b359,Florida,USA,2020-09-03 00:35:05
1,89328463488,Designing and implementing data models,success,Ontario;Canada,1589218599000,1366:c314:8d40:4219:4a51:138f:7ce3:d244,Ontario,Canada,2020-05-11 17:36:39
2,89328463489,Extracting data from various sources,success,California;USA,1595463446000,1ffa:cdd1:7a2b:a7d8:ab00:eada:94b3:7a7c,California,USA,2020-07-23 00:17:26


## Group the data into tasks and count the number of operations per task

In [143]:
# One row per task; every remaining column holds its non-null count for that task
task_groups = file.groupby('task')
by_task = task_groups.count()

by_task.head(n=9)

Unnamed: 0_level_0,trackerId,status,server_region,timestamp,server,state,country,datetime
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Building data pipelines,102,102,102,102,102,102,102,102
Data governance,116,116,116,116,116,116,116,116
Data integration,100,100,100,100,100,100,100,100
Data quality assurance,111,111,111,111,111,111,111,111
Designing and implementing data models,79,79,79,79,79,79,79,79
Extracting data from various sources,201,201,201,201,201,201,201,201
Loading data into a data warehouse,106,106,106,106,106,106,106,106
Optimizing database performance,83,83,83,83,83,83,83,83
Transforming data into a unified format,102,102,102,102,102,102,102,102


## Group by country and count 

In [116]:
# One row per country; every remaining column holds its non-null count for that country
country_groups = file.groupby('country')
by_country = country_groups.count()

by_country.head()

Unnamed: 0_level_0,trackerId,task,status,server_region,timestamp,server,state,datetime
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Australia,128,128,128,128,128,128,128,128
Canada,124,124,124,124,124,124,124,124
France,124,124,124,124,124,124,124,124
Japan,133,133,133,133,133,133,133,133
USA,491,491,491,491,491,491,491,491


## Group by month and count the number of operations per month

In [61]:
# Parse the 'datetime' column into real timestamps (overwrites the column on `file`)
file['datetime'] = pd.to_datetime(file['datetime'])

# Bucket rows by calendar month (month-end labels) and count non-null values per column
month_bucket = pd.Grouper(key='datetime', freq='M')
by_month = file.groupby(month_bucket).count()

by_month.head()

Unnamed: 0_level_0,trackerId,task,status,server_region,timestamp,server,state,country
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-30,124,124,124,124,124,124,124,124
2020-05-31,117,117,117,117,117,117,117,117
2020-06-30,127,127,127,127,127,127,127,127
2020-07-31,103,103,103,103,103,103,103,103
2020-08-31,113,113,113,113,113,113,113,113


## See the successfully executed tasks

In [104]:
# Keep only rows whose status is 'success', restricted to the three columns of interest
succeeded = file['status'] == 'success'
status_success = file.loc[succeeded, ['country', 'task', 'status']]

status_success.head(n=3)

Unnamed: 0,country,task,status
0,USA,Data quality assurance,success
1,Canada,Designing and implementing data models,success
2,USA,Extracting data from various sources,success


## Group successful tasks by task

In [124]:
# Count the successful rows per task (non-null count of each remaining column)
success_task_groups = status_success.groupby('task')
status_success_bytask = success_task_groups.count()

status_success_bytask.head()

Unnamed: 0_level_0,country,status
task,Unnamed: 1_level_1,Unnamed: 2_level_1
Building data pipelines,76,76
Data governance,90,90
Data integration,74,74
Data quality assurance,87,87
Designing and implementing data models,57,57


## See the failed operations

In [107]:
# Keep only rows whose status is 'failed', restricted to the three columns of interest
did_fail = file['status'] == 'failed'
status_fail = file.loc[did_fail, ['country', 'task', 'status']]

status_fail.head(n=3)

Unnamed: 0,country,task,status
24,USA,Extracting data from various sources,failed
42,USA,Optimizing database performance,failed
45,USA,Building data pipelines,failed


## See the successful operations for 'Extracting data from various sources' in the USA

In [99]:
# Build one boolean mask per condition, then combine them in a single .loc selection
is_extraction = file['task'] == 'Extracting data from various sources'
is_success = file['status'] == 'success'
is_usa = file['country'] == 'USA'

status_success_usa = file.loc[
    is_extraction & is_success & is_usa,
    ['task', 'status', 'country'],
]

status_success_usa.head()

Unnamed: 0,task,status,country
2,Extracting data from various sources,success,USA
18,Extracting data from various sources,success,USA
73,Extracting data from various sources,success,USA
84,Extracting data from various sources,success,USA
88,Extracting data from various sources,success,USA


## Working with PySpark

In [129]:
from pyspark.sql import SparkSession

# NOTE(review): this wildcard import puts every pyspark.sql.functions name into the
# notebook namespace (presumably shadowing builtins such as `sum`/`min`/`max` -- confirm);
# consider `from pyspark.sql import functions as F` once all dependent cells are checked.
from pyspark.sql.functions import *

# Get the active Spark session or create a new one named 'pyspark_analyses'
spark = (
    SparkSession.builder
    .appName('pyspark_analyses')
    .getOrCreate()
)



In [130]:
# Read the CSV into a Spark DataFrame, treating the first line as column names
log_file = (
    spark.read
    .option('header', 'true')
    .csv('log.csv')
)

log_file.show(n=5)

+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|  trackerId|                task| status|       server_region|    timestamp|              server|     state|  country|           datetime|
+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|89328463487|Data quality assu...|success|         Florida;USA|1599093305000|464f:bdbd:1d:535a...|   Florida|      USA|2020-09-03 00:35:05|
|89328463488|Designing and imp...|success|      Ontario;Canada|1589218599000|1366:c314:8d40:42...|   Ontario|   Canada|2020-05-11 17:36:39|
|89328463489|Extracting data f...|success|      California;USA|1595463446000|1ffa:cdd1:7a2b:a7...|California|      USA|2020-07-23 00:17:26|
|89328463490|Loading data into...|success|         Florida;USA|1605932487000|8ac5:9010:f24c:2f...|   Florida|      USA|2020-11-21 04:21:27|
|89328463491|Data qu

In [131]:
# Register the DataFrame as a temporary view so it can be queried with SQL syntax
log_file.createOrReplaceTempView(
    'log_file'
)

In [132]:
# Query the temporary view with SQL and print the first five rows
spark.sql('SELECT * FROM log_file').show(n=5)


+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|  trackerId|                task| status|       server_region|    timestamp|              server|     state|  country|           datetime|
+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|89328463487|Data quality assu...|success|         Florida;USA|1599093305000|464f:bdbd:1d:535a...|   Florida|      USA|2020-09-03 00:35:05|
|89328463488|Designing and imp...|success|      Ontario;Canada|1589218599000|1366:c314:8d40:42...|   Ontario|   Canada|2020-05-11 17:36:39|
|89328463489|Extracting data f...|success|      California;USA|1595463446000|1ffa:cdd1:7a2b:a7...|California|      USA|2020-07-23 00:17:26|
|89328463490|Loading data into...|success|         Florida;USA|1605932487000|8ac5:9010:f24c:2f...|   Florida|      USA|2020-11-21 04:21:27|
|89328463491|Data qu

## Group the data into tasks and count the number of operations per task

In [144]:
# Count the operations per task, most frequent task first.
# GROUP BY task already yields exactly one row per task, so the former
# DISTINCT was redundant and has been dropped; the result rows are unchanged.
spark.sql(
    'SELECT task, count(task) AS number_of_tasks '
    'FROM log_file '
    'GROUP BY task '
    'ORDER BY number_of_tasks DESC'
).show()


+--------------------+---------------+
|                task|number_of_tasks|
+--------------------+---------------+
|Extracting data f...|            201|
|     Data governance|            116|
|Data quality assu...|            111|
|Loading data into...|            106|
|Transforming data...|            102|
|Building data pip...|            102|
|    Data integration|            100|
|Optimizing databa...|             83|
|Designing and imp...|             79|
+--------------------+---------------+



## Group the data by task and country and count the number of operations in each group

In [162]:
# Count the operations for every (country, task) combination.
# GROUP BY country, task already yields one row per combination, so the former
# DISTINCT was redundant and has been dropped; the result rows are unchanged.
# There is no ORDER BY, so the row order of the output is not guaranteed.
spark.sql(
    'SELECT task, country, count(task) '
    'FROM log_file '
    'GROUP BY country, task'
).show()


+--------------------+---------+-----------+
|                task|  country|count(task)|
+--------------------+---------+-----------+
|Optimizing databa...|      USA|         49|
|     Data governance|      USA|         49|
|Transforming data...|    Japan|         15|
|Data quality assu...|   Canada|         16|
|Data quality assu...|   France|         14|
|Transforming data...|Australia|         10|
|Loading data into...|   Canada|         12|
|Optimizing databa...|Australia|         11|
|     Data governance|   France|         17|
|Designing and imp...|   Canada|          8|
|    Data integration|      USA|         39|
|Designing and imp...|Australia|         10|
|    Data integration|Australia|         17|
|    Data integration|    Japan|         14|
|    Data integration|   Canada|         19|
|     Data governance|Australia|         16|
|Transforming data...|   Canada|         10|
|Transforming data...|   France|         15|
|Designing and imp...|      USA|         42|
|    Data 

## Show successful tasks

In [169]:
# Keep only the rows whose status column equals 'success'
succeeded_rows = log_file['status'] == 'success'
status_success_pyspark = log_file.filter(succeeded_rows)

status_success_pyspark.show(n=6)

+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|  trackerId|                task| status|       server_region|    timestamp|              server|     state|  country|           datetime|
+-----------+--------------------+-------+--------------------+-------------+--------------------+----------+---------+-------------------+
|89328463487|Data quality assu...|success|         Florida;USA|1599093305000|464f:bdbd:1d:535a...|   Florida|      USA|2020-09-03 00:35:05|
|89328463488|Designing and imp...|success|      Ontario;Canada|1589218599000|1366:c314:8d40:42...|   Ontario|   Canada|2020-05-11 17:36:39|
|89328463489|Extracting data f...|success|      California;USA|1595463446000|1ffa:cdd1:7a2b:a7...|California|      USA|2020-07-23 00:17:26|
|89328463490|Loading data into...|success|         Florida;USA|1605932487000|8ac5:9010:f24c:2f...|   Florida|      USA|2020-11-21 04:21:27|
|89328463491|Data qu