## Homework
Showing the basics of the pandas library

In [59]:
import pandas as pd
import numpy as np

In [60]:
from enum import Enum

# Severity level of an error: LOW < MEDIUM < HIGH, with integer values 1..3.
# Built with the functional Enum API; the mapping fixes both member order and values.
Severity = Enum("Severity", {"LOW": 1, "MEDIUM": 2, "HIGH": 3})

# names of built-in Python exception / warning classes; they become the row labels
error_names = [
    "ArithmeticError",
    "SyntaxWarning",
    "BrokenPipeError",
    "BytesWarning",
    "FutureWarning",
    "BlockingIOError",
    "BufferError",
    "RuntimeWarning",
    "LookupError",
    "EnvironmentError",
]
n = len(error_names)

# random sample data, one value per error name
# NOTE(review): no random seed is set, so these values differ on every run
severity = tuple(np.random.randint(1, 4, n))              # tuple of plain ints in 1..3 (not Enum members)
times_occured = list(np.random.randint(5, 20, n))         # list of ints in 5..19
resolution_duration = list(np.random.rand(n) * 100 + 20)  # list of floats in [20, 120)

# label the rows with the error names right away (instead of 0..n-1);
# passing them as `index` avoids an in-place set_index and leaves the index unnamed
df = pd.DataFrame(
    {
        'severity': severity,
        'times_occured': times_occured,
        'resolution_duration': resolution_duration,
    },
    index=error_names,
)
df

Unnamed: 0,severity,times_occured,resolution_duration
ArithmeticError,3,14,60.399129
SyntaxWarning,2,10,23.282903
BrokenPipeError,1,11,81.329382
BytesWarning,2,19,84.0978
FutureWarning,1,6,69.743063
BlockingIOError,2,5,30.726316
BufferError,1,12,44.261336
RuntimeWarning,2,5,108.373334
LookupError,2,9,20.839613
EnvironmentError,2,6,90.305339


In [61]:
# loc: select rows / columns by label
# NOTE: only the last expression of a cell is rendered, so only the iloc result below is displayed
df.loc["BrokenPipeError"] # the single row labelled BrokenPipeError
df.loc[['ArithmeticError', 'EnvironmentError']] # the rows ArithmeticError and EnvironmentError

df.loc[:, "severity"] # all rows, severity column only

# iloc: select rows / columns by integer position
df.iloc[2:, [1,2]] # from the third row onwards, the columns at positions 1 and 2
# df.iloc[[True, False] * 5] # every other row using a boolean mask; the mask needs one entry per row (10 here)

Unnamed: 0,times_occured,resolution_duration
BrokenPipeError,11,81.329382
BytesWarning,19,84.0978
FutureWarning,6,69.743063
BlockingIOError,5,30.726316
BufferError,12,44.261336
RuntimeWarning,5,108.373334
LookupError,9,20.839613
EnvironmentError,6,90.305339


In [62]:
# inspect the dtype of every column
df.dtypes # severity -> int64, times_occured -> int64

# the severity codes are values of the Severity enum, so derive a readable
# column that holds the name of the matching member
df['severity_name'] = df['severity'].map(lambda code: Severity(code).name) # element-wise lookup
df.dtypes # severity_name -> object (the values are Python str)

# pandas lets us choose the storage type of a column explicitly;
# as an example, store the severity in a single byte
byte_severity_frame = df.astype({'severity': np.int8})
byte_severity_frame.dtypes # severity is now int8 (8 bits)

severity                  int8
times_occured            int64
resolution_duration    float64
severity_name           object
dtype: object

In [63]:
# rows whose label contains "Warning"; the mask is built from the frame's own index,
# so it always lines up with the rows even if df was filtered or reordered
# NOTE: the name `warnings` would shadow the stdlib module of the same name if it were imported
warnings = df.loc[df.index.str.contains("Warning", regex=False)]

# the severity-related columns of these warnings, matched by column name
warnings.loc[:, warnings.columns.str.contains("severity", regex=False)]

df.sample()   # one random row
df.head()     # first 5 rows
df.tail()     # last 5 rows
df.describe() # summary statistics of the numeric columns
df.info()     # index, column dtypes, non-null counts and memory usage (printed)


<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, ArithmeticError to EnvironmentError
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   severity             10 non-null     int64  
 1   times_occured        10 non-null     int64  
 2   resolution_duration  10 non-null     float64
 3   severity_name        10 non-null     object 
dtypes: float64(1), int64(2), object(1)
memory usage: 700.0+ bytes


In [64]:
# rows with a severity above 2 that also occurred more than 10 times
df.loc[df["severity"].gt(2) & df["times_occured"].gt(10)]

Unnamed: 0,severity,times_occured,resolution_duration,severity_name
ArithmeticError,3,14,60.399129,HIGH


In [65]:
# isin: element-wise membership test; here times_occured is checked against 9..14
df.isin({"times_occured": range(9, 15)})

# comparisons can be combined with boolean operators to build a row mask
df.loc[(df["severity"] > 2) & (df["times_occured"] > 10)]

# the same filter is more concise as a query string
df.query('severity > 2 and times_occured > 10')

# another filter, first written as a query ...
df.query('severity == 3 and resolution_duration > 70')

# ... and then the loc way: the element-wise "and" is the bitwise operator &, and each
# comparison needs its own parentheses because & binds tighter than == and >
# (only this last expression is displayed as the cell output)
df.loc[(df['severity'] == 3) & (df['resolution_duration'] > 70)]

Unnamed: 0,severity,times_occured,resolution_duration,severity_name


In [66]:
# one human-readable description per error, in the same order as the rows of df
# (kindly provided by gippity themselves)
error_descriptions = [
    "Base class for arithmetic errors.",
    "Base class for syntax warnings.",
    "Error when trying to write to a pipe that has been closed.",
    "Base class for warnings about bytes and buffer operations.",
    "Base class for warnings about constructs that will change in the future.",  # so funny
    "Error when an operation would block on an object that cannot be interrupted.",
    "Base class for buffer-related errors.",
    "base class for runtime warnings.",
    "base class for lookup errors.",
    "base class for errors raised by the operating system.",
]

# attach them to the frame as a new column
df['description'] = error_descriptions

# the .str accessor exposes vectorised string methods on a text column,
# for example a case-sensitive contains() check
df.loc[df['description'].str.contains("Base")]

# contains() also accepts a regular expression (regex=True),
# so both capitalisations can be matched at once
df.loc[df['description'].str.contains("[Bb]ase", regex=True)]

Unnamed: 0,severity,times_occured,resolution_duration,severity_name,description
ArithmeticError,3,14,60.399129,HIGH,Base class for arithmetic errors.
SyntaxWarning,2,10,23.282903,MEDIUM,Base class for syntax warnings.
BytesWarning,2,19,84.0978,MEDIUM,Base class for warnings about bytes and buffer...
FutureWarning,1,6,69.743063,LOW,Base class for warnings about constructs that ...
BufferError,1,12,44.261336,LOW,Base class for buffer-related errors.
RuntimeWarning,2,5,108.373334,MEDIUM,base class for runtime warnings.
LookupError,2,9,20.839613,MEDIUM,base class for lookup errors.
EnvironmentError,2,6,90.305339,MEDIUM,base class for errors raised by the operating ...


In [67]:
# what IS resolution_duration? is it seconds, milliseconds, hours?
# turns out someone made a mistake and accidentally entered daily temperatures (in Fahrenheit),
# so it was never a duration at all.
# we don't mind the additional info, so keep it as a properly labelled column in a
# more understandable unit: C = (F - 32) * 5/9
#
# assign() evaluates its columns in the order given, so the temperature is derived from
# the original values before they are rescaled. The chain returns a new frame that is
# bound back to `df`, instead of mutating the old frame in place.
df = (
    df
    .assign(**{
        'daily_temperature(°C)': lambda frame: (frame['resolution_duration'] - 32) * (5/9),
        # multiply resolution duration by pi for good measure
        'resolution_duration': lambda frame: frame['resolution_duration'] * np.pi,
    })
    # rename the resolution duration so the column name carries its unit
    .rename(columns={'resolution_duration': 'resolution_duration (ms)'})
)

# cool. but there's that specific outlier that is FutureWarning.
# i don't need no warning for future. its temperature is low too. let's drop it
# (drop returns a new frame; df itself keeps the row)
no_future = df.drop(index=["FutureWarning"])
no_future

Unnamed: 0,severity,times_occured,resolution_duration (ms),severity_name,description,daily_temperature(°C)
ArithmeticError,3,14,189.749461,HIGH,Base class for arithmetic errors.,15.777294
SyntaxWarning,2,10,73.145396,MEDIUM,Base class for syntax warnings.,-4.842832
BrokenPipeError,1,11,255.503791,LOW,Error when trying to write to a pipe that has ...,27.405212
BytesWarning,2,19,264.201029,MEDIUM,Base class for warnings about bytes and buffer...,28.943222
BlockingIOError,2,5,96.529567,MEDIUM,Error when an operation would block on an obje...,-0.707602
BufferError,1,12,139.051088,LOW,Base class for buffer-related errors.,6.811853
RuntimeWarning,2,5,340.464871,MEDIUM,base class for runtime warnings.,42.42963
LookupError,2,9,65.469574,MEDIUM,base class for lookup errors.,-6.200215
EnvironmentError,2,6,283.70259,MEDIUM,base class for errors raised by the operating ...,32.391855


In [68]:

# the column was already renamed to 'resolution_duration (ms)' in the previous cell,
# so repeating the rename here would silently do nothing; just display the full frame
# (FutureWarning is still present in df, only no_future lacks it)
df

Unnamed: 0,severity,times_occured,resolution_duration (ms),severity_name,description,daily_temperature(°C)
ArithmeticError,3,14,189.749461,HIGH,Base class for arithmetic errors.,15.777294
SyntaxWarning,2,10,73.145396,MEDIUM,Base class for syntax warnings.,-4.842832
BrokenPipeError,1,11,255.503791,LOW,Error when trying to write to a pipe that has ...,27.405212
BytesWarning,2,19,264.201029,MEDIUM,Base class for warnings about bytes and buffer...,28.943222
FutureWarning,1,6,219.104296,LOW,Base class for warnings about constructs that ...,20.968369
BlockingIOError,2,5,96.529567,MEDIUM,Error when an operation would block on an obje...,-0.707602
BufferError,1,12,139.051088,LOW,Base class for buffer-related errors.,6.811853
RuntimeWarning,2,5,340.464871,MEDIUM,base class for runtime warnings.,42.42963
LookupError,2,9,65.469574,MEDIUM,base class for lookup errors.,-6.200215
EnvironmentError,2,6,283.70259,MEDIUM,base class for errors raised by the operating ...,32.391855


In [69]:
# FutureWarning is the odd one out: we don't need warnings about the future,
# so build a copy of the frame without that row (df itself is left untouched)
no_future = df.drop(index="FutureWarning")
no_future

Unnamed: 0,severity,times_occured,resolution_duration (ms),severity_name,description,daily_temperature(°C)
ArithmeticError,3,14,189.749461,HIGH,Base class for arithmetic errors.,15.777294
SyntaxWarning,2,10,73.145396,MEDIUM,Base class for syntax warnings.,-4.842832
BrokenPipeError,1,11,255.503791,LOW,Error when trying to write to a pipe that has ...,27.405212
BytesWarning,2,19,264.201029,MEDIUM,Base class for warnings about bytes and buffer...,28.943222
BlockingIOError,2,5,96.529567,MEDIUM,Error when an operation would block on an obje...,-0.707602
BufferError,1,12,139.051088,LOW,Base class for buffer-related errors.,6.811853
RuntimeWarning,2,5,340.464871,MEDIUM,base class for runtime warnings.,42.42963
LookupError,2,9,65.469574,MEDIUM,base class for lookup errors.,-6.200215
EnvironmentError,2,6,283.70259,MEDIUM,base class for errors raised by the operating ...,32.391855
