# Function Codes

In [1]:
def dataCleaning(df, code=True, tips=False, orientation=True, formatIssues=True, missingValues=True, duplicateValues=True, outliers=True):
    """
    Print a guided data-cleaning report for a pandas DataFrame.

    The report is written to stdout; nothing is returned and ``df`` is not
    modified. Each section can be switched on or off independently.

    df: your dataframe

    code: A text template to note your observations as you go. Use the code
        snippets included in the output. copy-paste into vscode/notepad

    tips: Provides snippets of code to help you clean potential issues in
        your df. If you prefer this to code

    orientation: Provides information about the shape/objects of your data

    formatIssues: Provides detailed information on each column to help
        identify format issues

    missingValues: Provides information on missing values

    duplicateValues: Provides information on duplicate values

    outliers: Provides information on outliers (values further than three
        standard deviations from the mean) and plots a boxplot and a
        histogram per numeric column. matplotlib and seaborn are only
        required when this section is enabled.
    """
    import pandas as pd

    dtype_checks = pd.api.types
    separator = "========================================="

    def is_number_column(dtype):
        # Any integer/float width counts (int32, float32, nullable Int64, ...),
        # but booleans do not.
        return dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype)

    if code:
        template = (
            "### CLEANING CODE:",
            "df = dfX #Change to your df's name",
            "",
            "#### Change column value:",
            "",
            "",
            "#### Drop entire column:",
            "",
            "",
            "#### Change column type:",
            "",
            "",
            "#### Change column name:",
            "",
            "",
            "#### Handle missing values:",
            "",
            "",
            "#### Handle duplicate values:",
            "# df.drop_duplicates(inplace=True) # drop ALL duplicate rows",
            "",
            "#### Drop outliers:",
            "",
            "",
            "#### Other observations / further investigations:",
            "#",
            "#",
            "#",
            "",
            "df.head() #Final Review",
            "# dfX = df #Change to your df's name",
            "",
            separator,
        )
        for line in template:
            print(line)

    if orientation:
        print("ORIENTATION")
        # info() writes the summary itself and returns None, so it must not be
        # wrapped in print() (that would add a stray "None" line).
        df.info()
        print(separator)
        print()

    if formatIssues:
        print("FORMAT ISSUES")
        print()
        for col in df.columns:
            dtype = df[col].dtype
            profiled = (
                dtype_checks.is_object_dtype(dtype)
                or dtype_checks.is_string_dtype(dtype)
                or is_number_column(dtype)
                # Comparing the dtype with the string 'datetime64' never
                # matches 'datetime64[ns]'; use the dedicated check instead.
                or dtype_checks.is_datetime64_any_dtype(dtype)
            )
            if not profiled:
                continue

            # repr() keeps the generated snippets valid for non-string labels
            # (e.g. integer headers) and for names that contain a quote.
            label = repr(col)
            ref = f"df[{label}]"
            astype_hint = f"{ref} = {ref}.astype('new_type') # new_type can be int64, float64, object, category, datetime64"

            print(f"df.rename(columns={{{label}: ''}}, inplace=True) #rename column")
            print(f"{ref} = {ref}.replace('old_value', 'new_value')")
            print(astype_hint)
            print(f"df.drop({label}, axis=1, inplace=True)")
            # option_context restores the caller's own display.max_rows, even
            # if printing raises.
            with pd.option_context('display.max_rows', None):
                print(df.groupby(col, sort=True).size())
            print("Current Column DType: ", dtype, "     Do not compare with above. This one will always return int64 as it's the dtype of the count")
            # Repeated on purpose: the value counts above can be very long.
            print(astype_hint)
            print()

        if tips:
            print("TIPS")
            print("To make a correction to a column, use the following syntax:")
            print("df['A'] = df['A'].apply(lambda x: x.replace('old_value', 'new_value'))")
            print()
            print("To change a column data type, use the following syntax:")
            print("df['A'] = pd.to_datetime(df['A']) # for datetime")
            print("df['A'] = df['A'].astype('int64') # for integers")
            print("df['A'] = df['A'].astype('float64') # for floats")
            print("df['A'] = df['A'].astype('category') # for categorical")
            print("df['A'] = df['A'].astype('object') # for object")
            print()
        print(separator)
        print()

    if missingValues:
        print("MISSING VALUES")
        print()
        for col in df.columns:
            missing_mask = df[col].isnull()
            missing_count = int(missing_mask.sum())
            if missing_count > 0:
                label = repr(col)
                ref = f"df[{label}]"
                print(col, ":", missing_count, " missing values")
                print(f"df.dropna(subset=[{label}], inplace=True)")
                print(f"{ref}.fillna({ref}.mean(), inplace=True) #fill NA entries with the mean")
                print(f"{ref}.fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
                print()
                print(df.loc[missing_mask].head())
                print()
            else:
                print(col, ": No missing values")
                print()

        if tips:
            print()
            print("TIPS")
            print("You can drop rows with missing values using one of the following code:")
            print("df.dropna(subset=['col'], inplace=True) #For a single column")
            print("df.dropna(inplace=True) #For all columns")
            print()
            print("You can fill rows with missing values using one of the following code:")
            print("df['col'].fillna(df['col'].mean(), inplace=True) #fill NA entries with the mean")
            print("df['col'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
            print("df['col'].fillna(method='ffill') # forward-fill to propagate the previous value forward")
            print("df['col'].fillna(method='bfill') # back-fill to propagate the next values backward")
            print()
            print("To view them:")
            print("df.loc[df[col].isnull()].head()")
            print()
        print(separator)
        print()

    if duplicateValues:
        print("DUPLICATE VALUES")
        print()
        print(df[df.duplicated()].head())
        print()

        if tips:
            print("TIPS")
            print("You can drop duplicate rows using the following code:")
            print("df.drop_duplicates(inplace=True)")
            print("df.drop_duplicates(subset=['col'], inplace=True) #For a single column")
            print()
            print("To view them:")
            print("df[df.duplicated()].head()")
            print()

        print(separator)
        print()

    if outliers:
        # Imported here so the text-only sections work without the plotting
        # libraries installed.
        import matplotlib.pyplot as plt
        import seaborn as sns

        print("OUTLIERS")
        print()
        for col in df.columns:
            series = df[col]
            if not is_number_column(series.dtype):
                continue

            # Three-sigma bounds, computed once per column.
            centre = series.mean()
            spread = 3 * series.std()
            low = centre - spread
            high = centre + spread
            ref = f"df[{col!r}]"

            print(col)
            print("-----")
            print("Outlier(s):")
            print("Below ", low, " -> ", df[series < low].shape[0], " low outlier(s)")
            print("Above ", high, " -> ", df[series > high].shape[0], " high outlier(s)")
            print(f"df = df[({ref} > {low!s}) & ({ref} < {high!s})]")
            print()
            print(series.describe())
            print()
            print("Boxplot")
            sns.boxplot(series)
            plt.show()
            print()
            print("Histogram")
            sns.histplot(series)
            plt.show()
            print(separator)
            print()

        if tips:
            print("TIPS")
            print("You can drop outliers using the following code:")
            print("df = df[(df['column'] > lower_bound) & (df['column'] < upper_bound)]")
            print()

# Import

# Clean

In [None]:
# Run the cleaning report on `df` with every option left at its default.
# NOTE(review): `df` is not defined in the visible part of this notebook;
# confirm it is loaded (see the "Import" section) before running this cell.
dataCleaning(df)

# Export to CSV