In [9]:
import json
import keyword
import os
import re

import pandas as pd

In [10]:
def load_data(file_path, encoding='utf-8'):
    """
    Load tabular data from a file, choosing the reader from the file extension.

    Supports CSV (.csv), Excel (.xls / .xlsx), JSON (.json), and Parquet (.parquet) files.
    The extension check is case-insensitive.

    Parameters:
    - file_path (str): Path to the file to be loaded.
    - encoding (str): Text encoding used when reading CSV and JSON files. Default is 'utf-8'.
                      It is not passed to the Excel or Parquet readers.

    Returns:
    - dict or DataFrame: For Excel files, a dictionary of DataFrames keyed by sheet name
                         (every sheet is read). For other file types, a single DataFrame.
                         Column names are lower-cased for CSV and JSON input only.

    Raises:
    - FileNotFoundError: If nothing exists at file_path.
    - ValueError: If the file extension is not supported, or if the underlying reader
                  fails (the original exception is chained as the cause).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")

    file_extension = os.path.splitext(file_path)[1].lower()

    # Reject unknown extensions up front. Raising this inside the try block below
    # would let the generic handler re-wrap it as a misleading "Error loading the file".
    if file_extension not in ('.csv', '.xls', '.xlsx', '.json', '.parquet'):
        raise ValueError(f"Unsupported file extension: {file_extension}")

    try:
        if file_extension == '.csv':
            data = pd.read_csv(file_path, encoding=encoding)
            data.columns = data.columns.str.lower()
        elif file_extension == '.json':
            with open(file_path, 'r', encoding=encoding) as f:
                json_data = json.load(f)
            # Nested objects are flattened into dotted column names.
            data = pd.json_normalize(json_data)
            data.columns = data.columns.str.lower()
        elif file_extension == '.parquet':
            data = pd.read_parquet(file_path)
        else:
            # '.xls' / '.xlsx': sheet_name=None makes pandas return {sheet name: DataFrame}.
            data = pd.read_excel(file_path, sheet_name=None)
    except UnicodeDecodeError as e:
        raise ValueError(f"Error loading the file due to encoding issues: {e}") from e
    except Exception as e:
        # Reader failures are normalised to ValueError; the original error stays chained.
        raise ValueError(f"Error loading the file: {e}") from e

    return data

def sanitize_sheet_name(sheet_name):
    """
    Sanitize the sheet name to be a valid Python variable name and convert to lowercase.

    The name is lower-cased, every non-word character is replaced with an underscore,
    and an underscore is prepended when the name starts with a digit. Two cases that
    would otherwise not yield a usable variable name are also handled: an empty result
    becomes '_' and a Python keyword (e.g. 'class') gets a trailing underscore.

    Parameters:
    - sheet_name (str): Original sheet name.

    Returns:
    - str: Sanitized sheet name.
    """
    # `\W` replaces each non-word character; `^(?=\d)` is a zero-width match at the
    # start of a name beginning with a digit, so the substitution inserts a leading '_'.
    sanitized = re.sub(r'\W|^(?=\d)', '_', sheet_name.lower())

    if not sanitized:
        # An empty sheet name would otherwise produce an empty (invalid) identifier.
        return '_'
    if keyword.iskeyword(sanitized):
        # Reserved words cannot be used as variable names; follow the PEP 8
        # trailing-underscore convention.
        return sanitized + '_'
    return sanitized

In [24]:
def check_df(dataframe, head=5,
             quantiles=(0.05, 0.10, 0.25, 0.40, 0.50, 0.60, 0.75, 0.90, 0.95)):
    """
    Function to check basic characteristics of a DataFrame.

    Prints the shape, column dtypes, first and last rows, per-column null counts,
    and a transposed `describe()` summary at the requested quantiles.

    Parameters:
    ----------
    - dataframe: pandas.core.DataFrame
        DataFrame to be checked.
    - head: int
        Number of rows to display from the beginning and end of the DataFrame.
    - quantiles: sequence of float
        Percentiles (between 0 and 1) passed to `DataFrame.describe`.
        Defaults to the list this function has always used.

    Returns:
    -------
    None

    Example Usage:
    --------------
    check_df(df, head=10)
    check_df(df, quantiles=[0.25, 0.5, 0.75])
    """
    # Resolve the rich-display helper explicitly instead of relying on the `display`
    # global that only exists inside an IPython/Jupyter session; fall back to print
    # so the function also works in a plain Python interpreter.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    print("################## Shape ####################")
    print(dataframe.shape)
    print("################## Types ####################")
    print(dataframe.dtypes)
    print("################## Head ####################")
    show(dataframe.head(head))
    print("################## Tail ####################")
    show(dataframe.tail(head))
    print("################## NA ####################")
    print(dataframe.isnull().sum())
    print("################## Quantiles ####################")
    if dataframe.shape[1] == 0:
        # describe() raises ValueError on a frame without columns.
        print("No columns to describe.")
    else:
        # Transposed so each column of the input becomes one row of the summary.
        show(dataframe.describe(percentiles=list(quantiles)).T)

In [25]:
# Location of the dataset handed to load_data. Keep exactly one `path` assignment
# active; the commented-out alternatives point at an Excel workbook and a JSON file.
# NOTE(review): these are absolute paths on one machine — adjust before running elsewhere.
path = r"D:\College\Academics\Extra\Datasets\daily-total-female-births.csv"
# path = r"D:\College\Academics\Extra\Datasets\test.xlsx"
# path = r"D:\College\Academics\SEM 4\New Generation Database\Datasets\playstore.json"

# Helper Code

In [26]:
# Reading CSV or JSON: load_data returns a single DataFrame for these file types,
# so the first rows can be previewed directly.
df = load_data(path)
display(df.head())

# Reading Excel: load_data returns a dict of {sheet name: DataFrame} instead, so
# `df.head()` above would not apply. Use this loop in place of the two lines above
# to bind each sheet to a global variable named after its sanitized sheet name.
# excel_data = load_data(path)
# for sheet_name, df in excel_data.items():
#     sanitized_name = sanitize_sheet_name(sheet_name)
#     globals()[sanitized_name] = df
#     print(f"DataFrame '{sanitized_name}' created:")
#     display(globals()[sanitized_name].head())

Unnamed: 0,date,births
0,1959-01-01,35
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44


In [28]:
# Summarise the loaded frame: shape, dtypes, head/tail rows, null counts and quantiles.
check_df(df)

################## Shape ####################
(365, 2)
################## Types ####################
date      object
births     int64
dtype: object
################## Head ####################


Unnamed: 0,date,births
0,1959-01-01,35
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44


################## Tail ####################


Unnamed: 0,date,births
360,1959-12-27,37
361,1959-12-28,52
362,1959-12-29,48
363,1959-12-30,55
364,1959-12-31,50


################## NA ####################
date      0
births    0
dtype: int64
################## Quantiles ####################


Unnamed: 0,count,mean,std,min,5%,10%,25%,40%,50%,60%,75%,90%,95%,max
births,365.0,41.980822,7.348257,23.0,31.0,33.4,37.0,40.0,42.0,44.0,46.0,51.6,55.0,73.0
