# In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# create a sample dataframe
# Rows 0 and 3 are identical on purpose, so the duplicate-handling helper
# has something to find.
data = {
    'Name': ['John', 'Mary', 'Mike', 'John'],
    'Age': [25, 30, 35, 25],
    'Salary': [50000, 60000, 70000, 50000],
}
df = pd.DataFrame(data)

# define a function that takes a dataframe and returns the mean of each column
def get_mean(df):
    """Return the mean of every column of ``df`` as a list.

    The result is positionally aligned with ``df.columns``: entry ``i`` is
    the mean of column ``i``.  Columns for which pandas cannot compute a
    mean (for example columns of plain strings) yield ``nan`` rather than
    aborting the whole call, so the alignment is preserved.

    Args:
        df: DataFrame to summarise.

    Returns:
        list: One value per column, in column order.
    """
    means = []
    for col in df.columns:
        try:
            col_mean = df[col].mean()
        except (TypeError, ValueError):
            # pandas rejects reductions on non-numeric data with these errors;
            # keep a placeholder so positions still match df.columns.
            col_mean = float('nan')
        means.append(col_mean)
    return means

# define a function that takes a dataframe and returns the range of each column
def get_range(df):
    """Return ``max - min`` for every column of ``df`` as a list.

    The result is positionally aligned with ``df.columns``.  Columns whose
    extremes cannot be subtracted (for example string columns, where
    ``str - str`` is undefined) yield ``nan`` instead of raising.

    Args:
        df: DataFrame to summarise.

    Returns:
        list: One value per column, in column order.
    """
    ranges = []
    for col in df.columns:
        series = df[col]
        try:
            col_range = series.max() - series.min()
        except (TypeError, ValueError):
            # Either the reduction or the subtraction is undefined for this
            # column's values; keep a placeholder to preserve alignment.
            col_range = float('nan')
        ranges.append(col_range)
    return ranges

# define a function that takes a dataframe and returns the standard deviation of each column
def get_sdev(df):
    """Return the standard deviation of every column of ``df`` as a list.

    Uses ``Series.std()`` with its default arguments.  The result is
    positionally aligned with ``df.columns``; columns for which pandas
    cannot compute a standard deviation (for example string columns) yield
    ``nan`` instead of raising.

    Args:
        df: DataFrame to summarise.

    Returns:
        list: One value per column, in column order.
    """
    sdevs = []
    for col in df.columns:
        try:
            col_sdev = df[col].std()
        except (TypeError, ValueError):
            # pandas rejects reductions on non-numeric data with these errors;
            # keep a placeholder so positions still match df.columns.
            col_sdev = float('nan')
        sdevs.append(col_sdev)
    return sdevs

# define a function that takes a dataframe and returns the value counts for each column
def get_value_counts(df):
    """Map each column name of ``df`` to that column's ``value_counts()``.

    Args:
        df: DataFrame to summarise.

    Returns:
        dict: ``{column name: Series of value frequencies}``, with keys in
        column order.
    """
    return {column: df[column].value_counts() for column in df.columns}

# define a function that takes a dataframe and a threshold and returns the number of outliers for each column
def count_outliers(df, threshold=0.005):
    """Count, per column, the values lying far from the column mean.

    A value is counted when it is more than ``threshold`` standard
    deviations below or above the column mean.  Columns for which pandas
    cannot compute a mean or standard deviation (for example string
    columns) are left out of the result instead of raising.

    Args:
        df: DataFrame to inspect.
        threshold: Multiplier applied to the column's standard deviation to
            obtain the half-width of the accepted band around the mean.
            NOTE(review): the default of 0.005 makes the band extremely
            narrow, so nearly every value is counted; confirm whether a
            larger cut-off was intended.

    Returns:
        dict: ``{column name: number of outlying values}`` for every column
        on which the statistics could be computed.
    """
    outliers = {}
    for col in df.columns:
        series = df[col]
        try:
            center = series.mean()
            spread = threshold * series.std()
        except (TypeError, ValueError):
            # Mean/std are undefined for this column's values; skip it.
            continue
        outside_band = (series < center - spread) | (series > center + spread)
        outliers[col] = np.sum(outside_band)
    return outliers


# define a function that takes a dataframe and plots a distribution plot for each column
def plot_distributions(df):
    """Draw one histogram per column of ``df``, each in its own figure.

    Args:
        df: DataFrame whose columns are plotted with ``sns.histplot``.

    Returns:
        None. The function only produces plots.
    """
    for col in df.columns:
        # Open a fresh figure so each column gets its own axes instead of
        # every histogram being drawn on top of the previous one.
        plt.figure()
        sns.histplot(df[col])
        # Render per column, matching how plot_categorical displays its charts.
        plt.show()

# define a function that takes a dataframe and, for each string column, plots a bar chart of how often each unique value occurs
def plot_categorical(df):
    """Show a count plot for every text-like column of ``df``.

    A column qualifies when its dtype is ``object`` or one of pandas'
    dedicated string dtypes.  Comparing the dtype to ``'object'`` alone
    would skip columns stored with a string dtype.

    Args:
        df: DataFrame whose qualifying columns are plotted with
            ``sns.countplot``; each plot is displayed with ``plt.show()``.

    Returns:
        None. The function only produces plots.
    """
    for col in df.columns:
        dtype = df[col].dtype
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text_like:
            continue
        sns.countplot(x=col, data=df)
        plt.show()

# define a function that takes a dataframe and returns the data types of each column
def get_dtypes(df):
    """Map each column name of ``df`` to that column's dtype.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict: ``{column name: dtype}``, with keys in column order.
    """
    return {column: df[column].dtype for column in df.columns}





######CHECK
# define a function that takes a dataframe, drops its duplicate rows in place, and returns the same dataframe
def identify_and_remove_duplicates(dataframe):
    """Report and drop duplicate rows of ``dataframe`` in place.

    Prints how many rows are flagged by ``dataframe.duplicated()``, removes
    them with ``drop_duplicates(inplace=True)``, then prints whether a second
    de-duplication pass would still change the row count.

    Args:
        dataframe: DataFrame to clean; it is modified in place.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    num_duplicates = dataframe.duplicated().sum()
    print(f'Number of duplicates: {num_duplicates}')

    dataframe.drop_duplicates(inplace=True)

    # Sanity check: de-duplicating again must leave the row count unchanged.
    rows_after = len(dataframe)
    rows_if_deduplicated_again = len(dataframe.drop_duplicates())
    if rows_after != rows_if_deduplicated_again:
        print('Error: duplicates not removed')
    else:
        print('Duplicates removed successfully')

    return dataframe





def snake_case_columns(dataframe):
    """Lower-case the column names of ``dataframe`` and replace spaces with underscores.

    The column labels are reassigned on the passed-in object.

    Args:
        dataframe: DataFrame whose column labels are rewritten.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    lowered = dataframe.columns.str.lower()
    dataframe.columns = lowered.str.replace(' ', '_')
    return dataframe