**Renaming columns**

In [1]:
import re 
from unidecode import unidecode

def rename_columns(df):
    """Standardizes the column name of any df to snake_case pattern.
    """
    for col in df.columns:
        new_col_name = re.sub('\W','',unidecode(col.lower().strip().replace(' ','_')))
        df.rename(columns = {col:new_col_name}, inplace = True)

# Understanding the code:
# re.sub('\W','','string!@#') -- remove any non alphanumeric character. Return 'string'
# unidecode('áçê') -- Remove accents and special caracters. Return 'ace'
# 'STRING'.lower() -- transform to lowercase. Return 'string'
# ' string '.strip() -- remove whitespaces at the beginning and end of the string. Return 'string'
# 'my string'.replace(' ','_') -- replace all ' ' for '_'. Return 'my_string'

**Generating fake data**

In [2]:
import pandas as pd
from faker import Faker

# Create an Faker object
fake = Faker()

# Now you have many options to genarate data:
fake.name()
fake.text()
fake.address()
fake.email()
fake.date()
fake.country()
fake.phone_number()
fake.random_number(digits=5)

# let's generate an example DataFrame
faker_df = pd.DataFrame({'date':[fake.date() for i in range(10)],
                         'name':[fake.name() for i in range(10)],
                         'email':[fake.email() for i in range(10)],
                         'text':[fake.text() for i in range(10)]})
faker_df

Unnamed: 0,date,name,email,text
0,1982-01-07,Cindy Ryan,housedavid@example.org,Several century life hold under alone. Artist ...
1,2003-08-20,Pamela Gay,paullittle@example.net,Alone relationship level size bill interest.\n...
2,1984-12-22,William Flowers,ronald94@example.org,Former its term. Well idea across old toward c...
3,1989-05-04,Dorothy Hancock,walkerjoanne@example.com,Car can nearly reason drop director population...
4,1988-10-26,John Taylor,laurajackson@example.net,Go sense expect per public picture. Firm want ...
5,1979-03-11,Kyle Gonzalez,alyssamayo@example.net,Like read child single color. Training dark de...
6,1988-11-22,Brian Davis,taylorsean@example.net,Near fall painting eat quite paper watch. Sour...
7,1977-07-14,Ryan Mcdonald,gonzalezbrian@example.org,Forget method security write seek difference b...
8,2001-08-14,Maria Andrews,weeksmelissa@example.com,When network baby than. Keep best staff whole ...
9,2010-01-25,Tonya Harris,allisonjonathan@example.org,Suggest growth us always everything situation ...


**Using f-strings like a pro**

**Loading bar**

In [3]:
import time

def any_func_with_loop(): 
    t_start = time.perf_counter()
    
    list_to_loop = [i for i in range(333)]
    percentage_counter = 0 # Used to not repeat the same percentage
    loop_counter = 0
    
    for item in list_to_loop:
        
        # Your code here
        time.sleep(0.3)
        
        # Code to track execution progress
        loop_counter+=1
        # Calculate approximate progress (%)
        percentage = round(100*(loop_counter)/len(list_to_loop),0) 
        # Print from 10% to 10%
        if percentage%10 ==0:
            # Ensures rounded % will not be printed twice
            if percentage_counter !=percentage:
                t_prov = time.perf_counter()
                time_elapsed = t_prov -t_start
                time_estimated = (100*time_elapsed/percentage)-time_elapsed
                print(f'Status: {percentage:.0f}% Completed! -> '
                      f'Elapsed time:{time_elapsed:.2f}s or {time_elapsed/60:.2f}min. | '
                      f'Estimated time: {time_estimated:.2f}s or {time_estimated/60:.2f}min')
                percentage_counter = percentage
                

In [4]:
any_func_with_loop()

Status: 10% Completed! -> Elapsed time:9.91s or 0.17min. | Estimated time: 89.22s or 1.49min
Status: 20% Completed! -> Elapsed time:20.13s or 0.34min. | Estimated time: 80.52s or 1.34min
Status: 30% Completed! -> Elapsed time:30.66s or 0.51min. | Estimated time: 71.53s or 1.19min
Status: 40% Completed! -> Elapsed time:40.88s or 0.68min. | Estimated time: 61.31s or 1.02min
Status: 50% Completed! -> Elapsed time:51.10s or 0.85min. | Estimated time: 51.10s or 0.85min
Status: 60% Completed! -> Elapsed time:61.62s or 1.03min. | Estimated time: 41.08s or 0.68min
Status: 70% Completed! -> Elapsed time:71.81s or 1.20min. | Estimated time: 30.78s or 0.51min
Status: 80% Completed! -> Elapsed time:81.96s or 1.37min. | Estimated time: 20.49s or 0.34min
Status: 90% Completed! -> Elapsed time:92.48s or 1.54min. | Estimated time: 10.28s or 0.17min
Status: 100% Completed! -> Elapsed time:102.70s or 1.71min. | Estimated time: 0.00s or 0.00min


**Function to do basic analysis in any DataFrame**

In [105]:
df_teste = pd.read_csv('credit_card_history.csv')

In [108]:
def analyse_df(df, corr_limit = 0.75):
    
    """Analyse any dataframe and print results
    * Print df Shape, duplicate rows qnt, memory usage, call DataFrame.describe()
    * Check Missing values in each columns, returning quanity and percentage 
    * Check Linear Correlation between columns, return Pearson number

    Keyword arguments:
    df -- Any DataFrame
    corr_limit -- Correlation Limit (Pearson) to define if relationship exists (default 0.75)
    """   

    print('General Info:')
    print(f'{df.shape[0]} Rows {df.shape[1]} Columns'
          f'\n{df.duplicated().sum()} Duplicated Rows'
          f'\nMemory Usage: {df.memory_usage().sum()/(1024*1024):.2f}Mb')
    display(df.describe())
    
    # Checking Missing Values in each columns
    print('\nCheking Missing Values:')
    col_with_missing_counter = 0
    for col in df.columns:
        qnt_missing = df[col].isna().sum()
        if qnt_missing > 0:
            col_with_missing_counter +=1
            print(f'Column "{col}" has {qnt_missing} missing values ({qnt_missing/df.shape[0]:.2%})')
    if col_with_missing_counter ==0 :
        print('Analyzed DataFrame has no missing values')
        
    # Checking linear correlation between columns
    print('\nChecking Linear Correlation:')
    df_corr = df.corr() # Correlation DataFrame
    ckecked_list =[] # Ensure that we won't print the same information twice
    cols_with_correlation_counter = 0
    for col in df_corr.columns:
        ckecked_list.append(col)
        for i in range(len(df_corr)):
            if ((df_corr[col][i] > corr_limit or df_corr[col][i] < -corr_limit) and
                (df_corr.index[i] not in ckecked_list)):
                
                cols_with_correlation_counter += 1
                print(f'Linear Correlation found between columns '
                      f'{df_corr.index[i]} and {col} -> Pearson coef. = {df_corr[col][i]:.2f}')         
    if cols_with_correlation_counter == 0:
        print('No linear correlation was found')

In [109]:
analyse_df(df_teste)

General Info:
30000 Rows 17 Columns
36 Duplicated Rows
Memory Usage: 3.89Mb


Unnamed: 0,limit_bal,sex,education,marriage,age,pay_1,pay_2,bill_amt1,bill_amt2,pay_amt1,pay_amt2,default_payment_next_month,age_range_int,sum_pay,mean_bill,mean_pay_amt
count,30000.0,30000.0,29532.0,29623.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.808073,1.538906,35.4855,-0.0167,-0.133767,51223.3309,49179.075167,5663.5805,5921.163,0.2212,10.753767,-1.094633,269861.7,31651.39
std,129747.661567,0.489129,0.698643,0.498492,9.217904,1.123802,1.197186,73635.860576,71173.768783,16563.280354,23040.87,0.415062,5.957856,5.893055,379564.3,60827.68
min,10000.0,1.0,1.0,1.0,21.0,-2.0,-2.0,-165580.0,-69777.0,0.0,0.0,0.0,1.0,-12.0,-336259.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,3558.75,2984.75,1000.0,833.0,0.0,5.0,-5.0,28688.0,6679.75
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,22381.5,21200.0,2100.0,2009.0,0.0,11.0,0.0,126311.0,14383.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,67091.0,64006.25,5006.0,5000.0,0.0,16.0,0.0,342626.5,33503.5
max,1000000.0,2.0,3.0,2.0,79.0,8.0,8.0,964511.0,983931.0,873552.0,1684259.0,1.0,20.0,36.0,5263883.0,3764066.0



Cheking Missing Values:
Column "education" has 468 missing values (1.56%)
Column "marriage" has 377 missing values (1.26%)

Checking Linear Correlation:
Linear Correlation found between columns sum_pay and pay_2 -> Pearson coef. = 0.85
Linear Correlation found between columns bill_amt2 and bill_amt1 -> Pearson coef. = 0.95
Linear Correlation found between columns mean_bill and bill_amt1 -> Pearson coef. = 0.94
Linear Correlation found between columns mean_bill and bill_amt2 -> Pearson coef. = 0.96
