In [9]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
import pandas as pd
from datetime import date, timedelta
from matplotlib import pyplot as plt
import matplotlib as mpl

# Reuse the Spark session already attached to this kernel, or create one if none exists.
spark = SparkSession.builder.getOrCreate()

In [10]:
# Load the CSV using its first row as column names; inferSchema asks Spark to detect column types.
df = spark.read.csv('bank-full.csv',sep=",",header = True,inferSchema = True)

In [11]:
# Print a single record vertically (one field per line) as a quick look at the columns.
df.show(1,vertical = True)

-RECORD 0---------------
 age       | 58         
 job       | management 
 marital   | married    
 education | tertiary   
 default   | no         
 balance   | 2143       
 housing   | yes        
 loan      | no         
 contact   | unknown    
 day       | 5          
 month     | may        
 duration  | 261        
 campaign  | 1          
 pdays     | -1         
 previous  | 0          
 poutcome  | unknown    
 Target    | no         
only showing top 1 row



## __Parent Class__

In [12]:
from abc import ABC, abstractmethod
class QualityRule(ABC):
    """Abstract base for a single data-quality rule that is evaluated on construction.

    ``__init__`` stores every rule parameter on the instance and then runs the
    rule immediately by calling, in this order, the three hooks a subclass must
    implement: ``_count_population``, ``_errors_df`` and ``_count_errors``.

    Args:
        df: dataset the rule is evaluated against (stored as ``self.data``).
        field: name of the column under test.
        dtype_expected: expected data-type name (used by type rules).
        operator: comparison operator string (used by threshold rules).
        threshold: value the field is compared with (used by threshold rules).
        values: list of values (used by membership rules).
        min_value: lower bound (used by range rules).
        max_value: upper bound (used by range rules).
        key, df2, field2, key2: stored unchanged; not read by this class.
        rule_type: identifier of the rule, e.g. ``'1.1'``.

    Results available after construction:
        population: value returned by ``_count_population()``.
        errors_df: value returned by ``_errors_df()``.
        errors: value returned by ``_count_errors()``.
        compliance: ``(population - errors) / population`` as a percentage
            rounded to 2 decimals, or ``100.0`` when the population is 0.
    """

    def __init__(self, df, field:str, dtype_expected:str, operator:str, threshold, values:list, min_value, max_value,
                key, df2 , field2, key2, rule_type):
        super().__init__()
        self.data = df
        self.field = field
        self.dtype_expected = dtype_expected
        self.operator = operator
        self.threshold = threshold
        self.values = values
        self.min_value = min_value
        self.max_value = max_value
        self.key = key
        self.df2 = df2
        self.field2 = field2
        self.key2 = key2
        self.rule_type = rule_type
        # The hooks run in this order because _count_errors may read self.errors_df.
        self.population = self._count_population()
        self.errors_df = self._errors_df()
        self.errors = self._count_errors()
        if self.population:
            self.compliance = round(((self.population - self.errors)/self.population) * 100,2)
        else:
            # Nothing was evaluated, so nothing can violate the rule; this also
            # avoids a ZeroDivisionError on an empty dataset.
            self.compliance = 100.0

    @abstractmethod
    def _count_population(self):
        """Return the number of items the rule is evaluated on."""

    @abstractmethod
    def _errors_df(self):
        """Return the records that violate the rule (or a placeholder if not applicable)."""

    @abstractmethod
    def _count_errors(self):
        """Return how many items violate the rule."""

## __Child Classes__

### R_1_1

In [13]:
class R_1_1(QualityRule):
    """Rule 1.1 - data-type check: the field's dtype must equal ``dtype_expected``.

    The actual type is the name reported by ``DataFrame.dtypes`` for the
    selected column, so ``dtype_expected`` must use that same naming
    (for example ``'string'`` rather than ``'str'``).

    Args:
        df: DataFrame holding the column.
        field: name of the column to check.
        dtype_expected: type name the column is expected to have.
    """

    def __init__(self, df, field, dtype_expected):
        super().__init__(df, field, dtype_expected, operator = '-', threshold = '-', values = '-', min_value = '-', max_value = '-',
                        key = '-',df2 = '-', field2 = '-', key2 = '-', rule_type = '1.1')

    def _actual_dtype(self):
        """Return the type name reported for ``self.field``."""
        return self.data.select(self.field).dtypes[0][1]

    def _count_population(self):
        # The rule inspects exactly one field. Counting whitespace-separated
        # tokens of the dtype string only gave 1 when the type name happened
        # to contain no spaces.
        return 1

    def _errors_df(self):
        # A type mismatch concerns the column as a whole, so there are no error rows.
        return '-'

    def _count_errors(self):
        return 0 if self._actual_dtype() == self.dtype_expected else 1

In [14]:
# R_1_1 compares against the type name from DataFrame.dtypes, where text columns are 'string'.
R11 = R_1_1(df, "job", "string")

In [15]:
# Show the rule identifier that the R_1_1 constructor stored.
R11.rule_type

'1.1'

### R_1_2

In [16]:
class R_1_2(QualityRule):
    """Rule 1.2 - completeness: every row whose field is NULL is an error.

    Args:
        df: DataFrame to evaluate.
        field: name of the column that must not be NULL.
    """

    def __init__(self, df, field):
        # Parameters this rule does not use are filled with the '-' placeholder.
        unused = dict(
            dtype_expected = '-',
            operator = '-',
            threshold = '-',
            values = '-',
            min_value = '-',
            max_value = '-',
            key = '-',
            df2 = '-',
            field2 = '-',
            key2 = '-',
        )
        super().__init__(df, field, rule_type = '1.2', **unused)

    def _count_population(self):
        # Every row of the dataset is evaluated.
        total_rows = self.data.count()
        return total_rows

    def _errors_df(self):
        is_missing = F.col(self.field).isNull()
        return self.data.filter(is_missing)

    def _count_errors(self):
        null_rows = self.errors_df
        return null_rows.count()

In [17]:
# Completeness check on "job": rows where the column is NULL count as errors.
a = R_1_2(df, "job")

In [18]:
# Number of rows the rule was evaluated on.
a.population

45211

### R_1_3_D

In [19]:
class R_1_3_D(QualityRule):
    """Rule 1.3_D: every row whose field value appears in ``values`` is an error.

    Args:
        df: DataFrame to evaluate.
        field: name of the column to check.
        values: values that mark a row as an error when the field equals one of them.
    """

    def __init__(self, df, field, values):
        super().__init__(df = df, field = field, dtype_expected = '-', operator = '-', threshold = '-',
                         values = values, min_value = '-', max_value = '-', key = '-', df2 = '-', field2 = '-', key2 = '-', rule_type = '1.3_D')

    def _count_population(self):
        return self.data.count()

    def _errors_df(self):
        # The filtered frame must be returned: without it errors_df is None
        # and _count_errors fails with AttributeError.
        return self.data.filter(F.col(self.field).isin(self.values))

    def _count_errors(self):
        return self.errors_df.count()

### R_1_3_C

In [20]:
class R_1_3_C(QualityRule):
    """Rule 1.3_C: every row for which ``field <operator> threshold`` holds is an error.

    Args:
        df: DataFrame to evaluate.
        field: name of the column to compare.
        operator: one of ``'<='``, ``'<'``, ``'>'``, ``'>='``, ``'=='``, ``'!='``.
        threshold: value the column is compared with.

    Raises:
        ValueError: if ``operator`` is not one of the supported strings.
    """

    # Maps each supported operator string to the column comparison it selects.
    _COMPARATORS = {
        '<=': lambda column, limit: column <= limit,
        '<': lambda column, limit: column < limit,
        '>': lambda column, limit: column > limit,
        '>=': lambda column, limit: column >= limit,
        '==': lambda column, limit: column == limit,
        '!=': lambda column, limit: column != limit,
    }

    def __init__(self, df, field, operator, threshold):
        # Validate before the base constructor evaluates the rule. An unknown
        # operator used to make _errors_df return None, which only surfaced
        # later as an unrelated AttributeError in _count_errors.
        if operator not in self._COMPARATORS:
            supported = ', '.join(repr(symbol) for symbol in self._COMPARATORS)
            raise ValueError(f"Unsupported operator {operator!r}; expected one of: {supported}")
        super().__init__(df = df, field = field, dtype_expected = '-', operator = operator, threshold = threshold,
                         values = '-', min_value = '-', max_value = '-', key = '-', df2 = '-', field2 = '-', key2 = '-', rule_type = '1.3_C')

    def _count_population(self):
        return self.data.count()

    def _errors_df(self):
        compare = self._COMPARATORS[self.operator]
        return self.data.filter(compare(F.col(self.field), self.threshold))

    def _count_errors(self):
        return self.errors_df.count()

In [22]:
# Flag every row whose "balance" is below 0.
d = R_1_3_C(df, "balance", "<", 0)

In [23]:
# Percentage of rows that were not flagged by the rule.
d.compliance

91.67

### R_1_4

In [24]:
class R_1_4(QualityRule):
    """Rule 1.4 - range check: rows below ``min_value`` or above ``max_value`` are errors.

    Args:
        df: DataFrame to evaluate.
        field: name of the column to check.
        min_value: smallest accepted value (values equal to it are accepted).
        max_value: largest accepted value (values equal to it are accepted).
    """

    def __init__(self, df, field:str, min_value, max_value):
        super().__init__(
            df = df,
            field = field,
            dtype_expected = '-',
            operator = '-',
            threshold = '-',
            values = '-',
            min_value = min_value,
            max_value = max_value,
            key = '-',
            df2 = '-',
            field2 = '-',
            key2 = '-',
            rule_type = '1.4',
        )

    def _count_population(self):
        # Every row of the dataset is evaluated.
        total_rows = self.data.count()
        return total_rows

    def _errors_df(self):
        too_low = F.col(self.field) < self.min_value
        too_high = F.col(self.field) > self.max_value
        return self.data.filter(too_low | too_high)

    def _count_errors(self):
        out_of_range = self.errors_df
        return out_of_range.count()

In [28]:
# Flag rows whose "balance" is below 0 or above 99999.
f = R_1_4(df, "balance", 0, 99999)

In [27]:
# Number of rows that fall outside the accepted range.
f.errors

3767

### R_1_5

In [32]:
class R_1_5(QualityRule):
    """Rule 1.5 - allowed values: rows whose field is not in ``values`` are errors.

    Args:
        df: DataFrame to evaluate.
        field: name of the column to check.
        values: collection of accepted values for the column.
    """

    def __init__(self, df, field:str, values):
        super().__init__(
            df = df,
            field = field,
            dtype_expected = '-',
            operator = '-',
            threshold = '-',
            values = values,
            min_value = '-',
            max_value = '-',
            key = '-',
            df2 = '-',
            field2 = '-',
            key2 = '-',
            rule_type = '1.5',
        )

    def _count_population(self):
        # Every row of the dataset is evaluated.
        total_rows = self.data.count()
        return total_rows

    def _errors_df(self):
        is_allowed = F.col(self.field).isin(self.values)
        return self.data.filter(~is_allowed)

    def _count_errors(self):
        rejected_rows = self.errors_df
        return rejected_rows.count()

In [33]:
# Flag rows whose "housing" value is neither "yes" nor "no".
R15 = R_1_5(df, "housing", ["yes","no"])

In [34]:
# Percentage of rows that were not flagged by the rule.
R15.compliance

100.0

### R_1_6

In [37]:
class R_1_6(QualityRule):
    """Rule 1.6 - blank characters: rows whose field contains any space are errors.

    The error frame carries an extra ``blanks`` column holding the number of
    space characters found in the field.

    Args:
        df: DataFrame to evaluate.
        field: name of the column to check.
    """

    def __init__(self, df, field:str):
        super().__init__(
            df = df,
            field = field,
            dtype_expected = '-',
            operator = '-',
            threshold = '-',
            values = '-',
            min_value = '-',
            max_value = '-',
            key = '-',
            df2 = '-',
            field2 = '-',
            key2 = '-',
            rule_type = '1.6',
        )

    def _count_population(self):
        # Every row of the dataset is evaluated.
        total_rows = self.data.count()
        return total_rows

    def _errors_df(self):
        # Dropping every run of non-space characters leaves only the spaces,
        # so the remaining length is the number of spaces in the value.
        spaces_only = F.regexp_replace(self.field, '[^ ]+', '')
        with_blank_count = self.data.withColumn('blanks', F.length(spaces_only))
        return with_blank_count.filter(F.col('blanks')!= 0)

    def _count_errors(self):
        rows_with_blanks = self.errors_df
        return rows_with_blanks.count()

In [38]:
# Flag "marital" values that contain at least one space character.
R16 = R_1_6(df,'marital')

In [39]:
# Number of rows whose value contains a space.
R16.errors

0

### R_1_7

In [41]:
class R_1_7(QualityRule):
    """Rule 1.7 - uniqueness: every repeat of an already-seen field value is an error.

    The error frame has one row per distinct value with its ``count`` and an
    ``errores`` column equal to ``count - 1``.

    Args:
        df: DataFrame to evaluate.
        field: name of the column whose values should be unique.
    """

    def __init__(self, df, field):
        # Parameters this rule does not use are filled with the '-' placeholder.
        unused = dict(
            dtype_expected = '-',
            operator = '-',
            threshold = '-',
            values = '-',
            min_value = '-',
            max_value = '-',
            key = '-',
            df2 = '-',
            field2 = '-',
            key2 = '-',
        )
        super().__init__(df, field, rule_type = '1.7', **unused)

    def _count_population(self):
        # Every row of the dataset is evaluated.
        total_rows = self.data.count()
        return total_rows

    def _errors_df(self):
        occurrences = self.data.select(self.field).groupBy(self.field).count()
        # The first occurrence of a value is legitimate; only the extras are errors.
        return occurrences.withColumn('errores',F.col('count')-1)

    def _count_errors(self):
        summed = self.errors_df.select(F.sum('errores')).collect()
        first_row = summed[0]
        return first_row[0]

In [42]:
# Uniqueness check on "age": every repeat of an already-seen value counts as an error.
R17 = R_1_7(df, "age")

In [43]:
# Total repeats: for each distinct value, its occurrences minus one, summed.
R17.errors

45134

# Functions

In [None]:
def finish_report(container: pd.DataFrame):
    """Turn the collected rule results into the final Spark report.

    Adds a sequential ``ID_order`` and an ``OKs`` column
    (``Population - Errors``), then keeps the report columns in display order.

    Args:
        container: pandas DataFrame with at least the columns ``Field``,
            ``Secondary_Field``, ``Object``, ``Rule_Type``, ``Population``,
            ``Errors`` and ``Compliance``.

    Returns:
        Spark DataFrame with columns ``ID_order``, ``Field``,
        ``Secondary_Field``, ``Object``, ``Rule_Type``, ``Population``,
        ``OKs``, ``Errors`` and ``Compliance``.
    """
    reporte_df = (spark
                  .createDataFrame(container)
                  # A temporary increasing ID gives row_number() something to order by.
                  .withColumn('ID', F.monotonically_increasing_id())
                  .withColumn('ID_order', F.row_number().over(Window.orderBy('ID')))
                  # Use the same English column names that the select below expects;
                  # 'Poblacion'/'Errores' are not among the report columns.
                  .withColumn('OKs', F.col('Population')-F.col('Errors'))
                  .select('ID_order', 'Field','Secondary_Field','Object', 'Rule_Type', 'Population', 'OKs', 'Errors', 'Compliance'))
    return reporte_df

In [None]:
def plot_report(reporte_df: "pyspark.sql.DataFrame", low_threshold = 90, high_threshold = 95):
    """Draw a horizontal bar chart with the compliance of every rule in the report.

    Bars are red below ``low_threshold``, yellow from ``low_threshold`` up to
    (but not including) ``high_threshold``, and green from ``high_threshold``
    upwards. Dashed guide lines mark both thresholds.

    Args:
        reporte_df: Spark DataFrame with the columns ``ID_order`` and ``Compliance``.
        low_threshold: compliance percentage below which a bar is red.
        high_threshold: compliance percentage from which a bar is green.
    """
    fig, ax = plt.subplots(figsize=(15,8), facecolor=(.94, .94, .94))

    # Collect both columns together so each rule ID stays paired with its own value.
    rows = reporte_df.select('ID_order', 'Compliance').collect()
    reglas = [row['ID_order'] for row in rows]
    cumplimiento = [row['Compliance'] for row in rows]

    # Colour bands follow the two guide lines; previously yellow was only
    # reachable at exactly 90 and everything above 90 was green.
    cols = ['red' if x < low_threshold else 'yellow' if x < high_threshold else 'green'
            for x in cumplimiento]
    bars = ax.barh(reglas,cumplimiento, color = cols, align='center')
    plt.axvline(x = low_threshold, color = 'red', ls='--')
    plt.axvline(x = high_threshold, color = 'yellow', ls='--')

    ax.set_facecolor('#eafff5')
    # Bar labels
    ax.bar_label(bars, fmt= '{:,.2f}%', label_type ='center', color = 'white')
    # X-axis tick format (percentages)
    ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}%'))
    # Axis labels
    ax.set(ylabel='ID Rule', xlabel = 'Compliance')
    # Title
    title = plt.title('Quality Rules Results',fontsize=18,pad=20)
    title.set_position([.12, 1])
    plt.show()

In [None]:
def compliance(df: "pyspark.sql.DataFrame", by = 'Total'):
    """Print a summary of the report followed by the average compliance.

    The summary lists the number of rules, distinct objects, distinct fields
    and the rules-per-field / rules-per-object ratios rounded to 2 decimals.

    Args:
        df: Spark report DataFrame with the columns ``Object``, ``Field`` and
            ``Compliance`` (one row per rule).
        by: ``'Total'`` prints the overall average compliance, ``'Objects'``
            one average per object, ``'Fields'`` one average per field.
            Any other value prints only the summary.

    Returns:
        None. All results are printed.
    """
    # Each count is a Spark action, so run it once and reuse the result.
    n_rules = df.count()
    n_objects = df.select('Object').distinct().count()
    n_fields = df.select('Field').distinct().count()

    # An empty report has no fields or objects; show 0 instead of dividing by zero.
    rules_per_field = round(n_rules / n_fields, 2) if n_fields else 0
    # The digits argument belongs inside round(); it used to be printed as a stray "2".
    rules_per_object = round(n_rules / n_objects, 2) if n_objects else 0

    print(' Cantidad de Reglas: ', n_rules,'\n',
          'Cantidad de objetos: ', n_objects,'\n',
          'Cantidad de campos: ', n_fields,'\n',
          'Reglas por campo: ', rules_per_field,'\n',
          'Reglas por objeto: ', rules_per_object,'\n')

    if by == 'Total':
        print('% de Cumplimiento Total: ', df.select(F.round(F.avg('Compliance'),2)).collect()[0][0])
    elif by == 'Objects':
        _print_mean_compliance(df, 'Object')
    elif by == 'Fields':
        _print_mean_compliance(df, 'Field')


def _print_mean_compliance(df, group_col):
    """Print the average compliance, rounded to 2 decimals, for each value of ``group_col``."""
    grouped = (df.select(group_col, 'Compliance')
                 .groupBy(group_col)
                 .agg(F.round(F.avg('Compliance'),2).alias('mean')))
    # A single collect keeps every group label paired with its own average.
    for row in grouped.collect():
        print('% de Cumplimiento -', row[group_col],':', row['mean'])