# Functions

In [24]:
import pandas as pd
import sqlite3

def connect():
    conn = sqlite3.connect('base_bacen.sqlite')
    return conn

#Variable to connect
conn = connect()

#Function to execute queries
def query(conn, sql_query):
  try:
      df = pd.read_sql(sql_query, con=conn)
  except TypeError:
      df = None
  return df
  
#Function to find NULL by one column
def select_null(column, table):
  query_select_null = 'SELECT ' + column + ' ' + \
    'FROM ' + table + ' ' + \
    'WHERE ' + column + ' = "nan" ' + \
    'OR ' + column + ' = NULL'
  return query_select_null
 
#Function to find all table NULLs 
def verify_all_table_nulls(list_columns, table):
  for column in list_columns:
    select_null_by_column = select_null(column, table)
    null_columns = query(conn, select_null_by_column)
    if (null_columns.empty):
      print("Column " + column + ": 0")
    else:
      print("Column " + column + ": " + str(len(null_columns)))

#Function that selects Mode of a column
def select_mode(column, table):
  mode = 'SELECT ' + column + ' ' + \
    'AS ' + 'mode_' + column + ' ' + \
    'FROM ' + table + ' ' \
    'GROUP BY ' + column + ' ' + \
    'ORDER  BY COUNT(*) DESC ' + \
    'LIMIT 1'
  return mode

#Function that selects Mean of a column
def select_mean(column, table):
  mean = 'SELECT AVG(' + column + ') ' + \
    'as ' + 'mean_' + column + ' ' + \
    'FROM ' + table
  return mean

#Function that selects Median of a column
def select_median(column, table):
  median = 'SELECT AVG(' + column + ') ' + \
   'AS ' + 'median_' + column + ' ' + \
    'FROM (SELECT ' + column + ' ' + \
    'FROM ' + table + ' ' + \
      'ORDER BY ' + column + ' ' + \
       'LIMIT 2 - (SELECT COUNT(*) FROM ' + table + ') % 2 ' + \
    'OFFSET (SELECT (COUNT(*) - 1) / 2 ' + \
       'FROM ' + table + '))'
  return median

#Function to look for min and max outliers
def print_outliers(min, max):
  min_outlier = str(min)
  max_outlier = str(max)
  print('Min Outlier\n')

  #Looking for min outliers
  for column in table_columns:
    outliers_query = 'SELECT ' + column + ' FROM ' + table_name + ' ' + \
            'WHERE CAST(' + column + ' AS INTEGER) < ' + min_outlier
    outlier = query(conn, outliers_query)
    print("Column " + column + ": " + str(len(outlier)))

  #Looking for max outliers
  print('\nMax Outlier\n')

  for column in table_columns:
    outliers_query =  'SELECT ' + column + ' FROM ' + table_name + ' ' + \
            'WHERE CAST(' + column + ' AS INTEGER) > ' + max_outlier
    outlier = query(conn, outliers_query)
    print("Column " + column + ": " + str(len(outlier)))

# Table Info

In [2]:
#Table Name
table_name = 'scr'

#List of table columns
table_columns = [
  "valor_credito_vencer_ate_30_dia",
  "valor_credito_vencer_31_60_dia" ,
  "valor_credito_vencer_61_90_dia" ,
  "valor_credito_vencer_acima_90_dia" ,
  "valor_credito_vencido_15_30_dia" ,
  "valor_credito_vencido_31_60_dia" ,
  "valor_credito_vencido_61_90_dia" ,
  "valor_credito_vencido_acima_90_dia"
]

# Perfiming Credit Analysis

In [29]:
#Describing df with Pandas
df = query(conn, 'SELECT * FROM scr')

#Converting to float to assure they're on the right type
for column in table_columns:
  df[column] = df[column].astype(float)

df.describe()

Unnamed: 0,valor_credito_vencer_ate_30_dia,valor_credito_vencer_31_60_dia,valor_credito_vencer_61_90_dia,valor_credito_vencer_acima_90_dia,valor_credito_vencido_15_30_dia,valor_credito_vencido_31_60_dia,valor_credito_vencido_61_90_dia,valor_credito_vencido_acima_90_dia
count,272619.0,272619.0,272619.0,272619.0,259347.0,272619.0,272619.0,272619.0
mean,1054.678767,403.670035,301.1223,8842.787,45.20521,56.876465,63.074537,295.9231
std,3730.736723,1453.834721,2639.395,2298424.0,638.408454,1012.533892,1266.193974,5617.983
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,200.93,8.15,0.0,0.0,0.0,0.0,0.0,0.0
75%,838.955,303.49,214.65,1230.335,0.0,0.0,0.0,0.0
max,481007.32,76177.25,1259003.0,1200000000.0,156976.35,280592.1,244155.82,1248027.0


# Looking or Null and NaN

In [5]:
verify_all_table_nulls(table_columns, table_name)

Column valor_credito_vencer_ate_30_dia: 0
Column valor_credito_vencer_31_60_dia: 0
Column valor_credito_vencer_61_90_dia: 0
Column valor_credito_vencer_acima_90_dia: 0
Column valor_credito_vencido_15_30_dia: 13272
Column valor_credito_vencido_31_60_dia: 0
Column valor_credito_vencido_61_90_dia: 0
Column valor_credito_vencido_acima_90_dia: 0


Found 13272 nan values valor_credito_vencido_15_30_dia

# Looking for Outliers

In [30]:
df.groupby(['chave_cpf']).size()

chave_cpf
10003023342    12
10004431332     1
1000721890      8
10008661896    13
10009504506     9
               ..
99988353331     7
99989120865    14
99991920229     8
99994904248    14
99997063221     3
Length: 31450, dtype: int64

In [31]:
df.groupby(['codigo_modalidade_operacao']).size()

codigo_modalidade_operacao
101      1945
1304    81683
202     26124
203     36958
204     47170
210     32064
213     25617
218     20498
406       156
499       404
dtype: int64

Found Product 203 wich wasn't informed by the Product Owner

In [32]:
df.groupby(['valor_credito_vencer_ate_30_dia']).size()

valor_credito_vencer_ate_30_dia
0.00         54037
0.01           226
0.02           196
0.03           137
0.04           129
             ...  
254286.76        1
271779.32        1
315906.75        1
329836.86        1
481007.32        1
Length: 121039, dtype: int64

In [33]:
df.groupby(['valor_credito_vencer_31_60_dia']).size()

valor_credito_vencer_31_60_dia
0.00        130207
0.01            25
0.02            19
0.03            16
0.04            13
             ...  
66022.65         1
67754.69         1
71063.67         1
74029.01         1
76177.25         1
Length: 81473, dtype: int64

In [34]:
df.groupby(['valor_credito_vencer_61_90_dia']).size()

valor_credito_vencer_61_90_dia
0.00          144014
0.01               4
0.02               8
0.03               8
0.04               6
               ...  
56619.84           1
58545.74           1
59777.08           1
98794.38           1
1259003.05         1
Length: 72160, dtype: int64

In [35]:
df.groupby(['valor_credito_vencer_acima_90_dia']).size()

valor_credito_vencer_acima_90_dia
0.000000e+00    139365
1.000000e-02        22
2.000000e-02        37
3.000000e-02        37
4.000000e-02        23
                 ...  
2.086403e+06         1
3.389030e+06         1
4.366009e+06         1
5.051969e+06         1
1.200000e+09         1
Length: 111202, dtype: int64

In [36]:
df.groupby(['valor_credito_vencido_15_30_dia']).size()

valor_credito_vencido_15_30_dia
0.00         242329
0.01             12
0.02              5
0.03              2
0.04              3
              ...  
55902.89          1
59216.48          1
62630.63          1
64642.95          1
156976.35         1
Length: 14733, dtype: int64

In [37]:
df.groupby(['valor_credito_vencido_31_60_dia']).size()

valor_credito_vencido_31_60_dia
0.00         255803
0.01              7
0.02              3
0.03              2
0.04              3
              ...  
93815.74          1
104783.46         1
120942.57         1
123819.38         1
280592.10         1
Length: 14864, dtype: int64

In [38]:
df.groupby(['valor_credito_vencido_61_90_dia']).size()

valor_credito_vencido_61_90_dia
0.00         259741
0.01              4
0.02              2
0.03              1
0.05              4
              ...  
137899.48         1
150010.29         1
162843.48         1
211966.00         1
244155.82         1
Length: 11864, dtype: int64

In [39]:
df.groupby(['valor_credito_vencido_acima_90_dia']).size()

valor_credito_vencido_acima_90_dia
0.00          250055
0.01               8
0.02               8
0.03              10
0.04               5
               ...  
666587.18          1
783425.88          1
861291.80          1
979210.14          1
1248027.21         1
Length: 21417, dtype: int64

In [19]:
#Looking for outliers < 0 or > 1,000,000
print_outliers(0, 1000000)

Min Outlier

Column valor_credito_vencer_ate_30_dia: 0
Column valor_credito_vencer_31_60_dia: 0
Column valor_credito_vencer_61_90_dia: 0
Column valor_credito_vencer_acima_90_dia: 0
Column valor_credito_vencido_15_30_dia: 0
Column valor_credito_vencido_31_60_dia: 0
Column valor_credito_vencido_61_90_dia: 0
Column valor_credito_vencido_acima_90_dia: 0

Max Outlier

Column valor_credito_vencer_ate_30_dia: 0
Column valor_credito_vencer_31_60_dia: 0
Column valor_credito_vencer_61_90_dia: 1
Column valor_credito_vencer_acima_90_dia: 15
Column valor_credito_vencido_15_30_dia: 0
Column valor_credito_vencido_31_60_dia: 0
Column valor_credito_vencido_61_90_dia: 0
Column valor_credito_vencido_acima_90_dia: 1


Found values greater than 1,000,000 but none less than 0

# Mode, Mean and Median

In [25]:
#Select Mode
for column in table_columns:
  select = select_mode(column, table_name)
  message = 'Mode of ' + column + ' = ' + str(query(conn, select).values[0]).replace("'", "").replace("[", "").replace("]", "")
  print(message)

Mode of valor_credito_vencer_ate_30_dia = 0.0
Mode of valor_credito_vencer_31_60_dia = 0.0
Mode of valor_credito_vencer_61_90_dia = 0.0
Mode of valor_credito_vencer_acima_90_dia = 0.0
Mode of valor_credito_vencido_15_30_dia = 0.0
Mode of valor_credito_vencido_31_60_dia = 0.0
Mode of valor_credito_vencido_61_90_dia = 0.0
Mode of valor_credito_vencido_acima_90_dia = 0.0


In [26]:
#Select Mean
for column in table_columns:
  select = select_mean(column, table_name)
  message = 'Mean of ' + column + ' = ' + str(query(conn, select).values[0]).replace("'", "").replace("[", "").replace("]", "")
  print(message)

Mean of valor_credito_vencer_ate_30_dia = 1054.67876747
Mean of valor_credito_vencer_31_60_dia = 403.67003463
Mean of valor_credito_vencer_61_90_dia = 301.12227317
Mean of valor_credito_vencer_acima_90_dia = 8842.78738309
Mean of valor_credito_vencido_15_30_dia = 43.00447016
Mean of valor_credito_vencido_31_60_dia = 56.87646543
Mean of valor_credito_vencido_61_90_dia = 63.07453743
Mean of valor_credito_vencido_acima_90_dia = 295.92307103


In [27]:
#Select Median
for column in table_columns:
  select = select_median(column, table_name)
  message = 'Median of ' + column + ' = ' + str(query(conn, select).values[0]).replace("'", "").replace("[", "").replace("]", "")
  print(message)

Median of valor_credito_vencer_ate_30_dia = 2272.61
Median of valor_credito_vencer_31_60_dia = 1067.68
Median of valor_credito_vencer_61_90_dia = 0.
Median of valor_credito_vencer_acima_90_dia = 0.
Median of valor_credito_vencido_15_30_dia = 0.
Median of valor_credito_vencido_31_60_dia = 0.
Median of valor_credito_vencido_61_90_dia = 0.
Median of valor_credito_vencido_acima_90_dia = 0.
