LOAD LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
# Suppress every warning for the rest of the session. This is a global switch:
# it also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

Load CSV function

In [2]:
def csv_imported(x, t=None):
    """Load a CSV source into a DataFrame.

    Parameters
    ----------
    x : path or file-like object
        Source handed straight to ``pandas.read_csv``.
    t : str, optional
        Field separator, forwarded as ``sep``. The default ``None`` is passed
        through unchanged, leaving delimiter handling to pandas.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(x, sep=t)
    return frame

Summary function

In [3]:
def data_summary(y, c=None, d=None, e=None):
    """Print one or more exploratory summaries of a DataFrame.

    Parameters
    ----------
    y : pandas.DataFrame
        Data to summarise.
    c : column label or list of labels, optional
        Column selection used by 'nunique' and 'nnan' (optional there) and
        required by 'vcounts', 'pcounts' and 'trim_mean'.
    d : str or iterable of str, optional
        Operation name(s) to run, in order. 'all' runs every summary that
        needs no extra argument (so not 'vcounts', 'pcounts' or 'trim_mean');
        'options' prints the list of valid names. When omitted, the list of
        valid names is printed and nothing else happens.
    e : float, optional
        Proportion to cut from each tail; only used by 'trim_mean'.

    Returns
    -------
    None
        Everything is printed or plotted; nothing is returned.
    """
    operations = ['corr_matrix', 'describe', 'dtypes', 'head', 'histogram', 'info', 'nnan', 'ncolumns', 'nrows', 'nunique',
    'outliers_count', 'pcounts', 'skew_kurt', 'trim_mean', 'vcounts', 'all', 'options']

    if d is None:
        # No operation requested: show the menu instead of failing on iteration.
        print(operations)
        return
    if isinstance(d, str):
        d = [d]

    for operation in d:
        if operation not in operations:
            print(f'Invalid operation: {operation}. Choose one of: {operations}')
            continue
        if operation == 'options':
            print(operations)
            continue

        run_all = operation == 'all'

        if run_all or operation == 'nrows':
            print('Rows: ' + str(y.shape[0]))
        if run_all or operation == 'ncolumns':
            print('Columns: ' + str(y.shape[1]))
        if run_all or operation == 'nunique':
            print('Nº unique values:')
            print(y.nunique() if c is None else y[c].nunique())
        if run_all or operation == 'dtypes':
            print('Datatypes:')
            print(y.dtypes)
        if run_all or operation == 'nnan':
            print('Nº of NaN:')
            print(y.isna().sum() if c is None else y[c].isna().sum())
        if run_all or operation == 'head':
            print(y.head())
        if run_all or operation == 'info':
            print('Information:')
            y.info()  # info() writes to stdout itself and returns None
        if operation == 'vcounts' and c is not None:
            print('Value counts:')
            print(y[c].value_counts())
        if operation == 'pcounts' and c is not None:
            print('Proportions counts:')
            print(y[c].value_counts(normalize=True))
        if run_all or operation == 'describe':
            print('Describe:')
            print(y.describe(include='all'))
        if operation == 'trim_mean':
            if c is None or e is None:
                # scipy's trim_mean cannot run without data and a cut fraction.
                print("'trim_mean' needs a column (c) and a proportion to cut (e).")
            else:
                print('Trim mean:')
                print(trim_mean(y[c], proportiontocut=e))
        if run_all or operation == 'corr_matrix':
            print('Correlation Matrix:')
            # Correlation is only defined for numeric data; text columns would
            # make DataFrame.corr() raise on recent pandas versions.
            numeric = y.select_dtypes(include=['number', 'bool'])
            if numeric.shape[1] == 0:
                print('No numeric columns to correlate.')
            else:
                correlation_matrix = numeric.corr()
                plt.figure(figsize=(6, 4))
                sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", cbar=True)
                plt.show()
        if run_all or operation == 'skew_kurt':
            print('Skewness:')
            print(y.skew(numeric_only=True))
            print('')
            print('Kurtosis:')
            print(y.kurt(numeric_only=True))
        if run_all or operation == 'outliers_count':
            print('Outliers count:')
            # Quantiles and the comparisons below only make sense for numbers.
            numeric = y.select_dtypes(include='number')
            Q1 = numeric.quantile(0.25)
            Q3 = numeric.quantile(0.75)
            IQR = Q3 - Q1
            # Tukey's rule: count values beyond 1.5 * IQR from the quartiles.
            outliers = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).sum()
            print(outliers)
        if run_all or operation == 'histogram':
            print('Histograms:')
            y.hist(figsize=(10, 6))
            plt.show()


Cleaner function

In [4]:
def cleaner(y, c=None, d=None, e=None, new=None, old=None):
    """Return a cleaned copy of a DataFrame.

    Two steps always run first: duplicate rows are dropped, and columns whose
    share of missing values exceeds 50% are removed. The requested operations
    are then applied in order. The caller's frame is never modified.

    Parameters
    ----------
    y : pandas.DataFrame
        Data to clean.
    c : column label or list of labels, optional
        Target column(s). Every operation except 'dropna' and 'options' is
        silently skipped when this is omitted.
    d : str or iterable of str, optional
        Operation name(s); 'options' prints the valid names. When omitted,
        only the two unconditional steps above are applied.
    e : optional
        Extra argument whose meaning depends on the operation:
        'fillna' -> 'mode' / 'mean' / 'median' or a literal fill value;
        'rename' -> new column name; 'dtype' -> target dtype;
        'bin' -> bins passed to ``pandas.cut``; 'sort' -> ascending flag.
        'rename', 'dtype', 'bin' and 'sort' are skipped when it is omitted.
    new, old : optional
        Replacement value and value to be replaced, used by 'replace'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    operations = ['bin', 'datetime', 'dropcol', 'dropna', 'dtype', 'fillna', 'lower', 'outlier', 'rename',
                  'reorder', 'replace', 'scale', 'sort', 'standard', 'upper', 'options']

    y = y.drop_duplicates()
    # Explicit copy so the column assignments below never write into a view
    # of the caller's data.
    y = y.loc[:, y.isnull().mean() <= 0.5].copy()

    if d is None:
        return y
    if isinstance(d, str):
        d = [d]

    for operation in d:
        if operation not in operations:
            print(f'Invalid operation: {operation}. Choose one of: {operations}')
            continue
        if operation == 'options':
            print(operations)
            continue
        if operation == 'dropna':
            y = y.dropna()
            continue
        if c is None:
            # Every remaining operation acts on the column selection.
            continue

        if operation == 'dropcol':
            y = y.drop(columns=c)
        elif operation == 'upper':
            y[c] = y[c].str.upper()
        elif operation == 'lower':
            y[c] = y[c].str.lower()
        elif operation == 'fillna':
            if e == 'mode':
                fill_value = y[c].mode().iloc[0]
            elif e == 'mean':
                fill_value = y[c].mean()
            elif e == 'median':
                fill_value = y[c].median()
            else:
                fill_value = e
            # Assign back instead of a chained in-place fill, which is not
            # guaranteed to reach the parent frame.
            y[c] = y[c].fillna(fill_value)
        elif operation == 'standard':
            y[c] = (y[c] - y[c].mean()) / y[c].std()
        elif operation == 'scale':
            y[c] = (y[c] - y[c].min()) / (y[c].max() - y[c].min())
        elif operation == 'rename':
            if e is not None:
                y = y.rename(columns={c: e})
        elif operation == 'dtype':
            if e is not None:
                y[c] = y[c].astype(e)
        elif operation == 'datetime':
            # to_datetime returns a new object; it must be stored to have any effect.
            y[c] = pd.to_datetime(y[c])
        elif operation == 'replace':
            y[c] = y[c].replace(old, new)
        elif operation == 'outlier':
            q1 = y[c].quantile(0.25)
            q3 = y[c].quantile(0.75)
            iqr = q3 - q1
            # Keep rows within 1.5 * IQR of the quartiles (Tukey's rule).
            y = y[(y[c] >= (q1 - 1.5 * iqr)) & (y[c] <= (q3 + 1.5 * iqr))].copy()
        elif operation == 'bin':
            if e is not None:
                y[c] = pd.cut(y[c], bins=e, labels=False)
        elif operation == 'sort':
            if e is not None:
                y = y.sort_values(by=c, ascending=e)
        elif operation == 'reorder':
            # Accept a single column name as well as a list of names.
            leading = [c] if isinstance(c, str) else list(c)
            y = y[leading + [col for col in y.columns if col not in leading]]
    return y



SAVE CSV function

In [5]:
def csv_saving(data, x):
    """Write tabular data to CSV without the index column.

    Parameters
    ----------
    data : anything accepted by the ``pandas.DataFrame`` constructor
        Content to save; it is wrapped in a DataFrame before writing.
    x : path or writable file-like object
        Destination handed to ``DataFrame.to_csv``.
    """
    pd.DataFrame(data).to_csv(x, index=False)