In [1]:
import pandas as pd

# Load the mutual-fund NAV history (one row per scheme per date) from parquet.
df = pd.read_parquet('mf_historical_clean.parquet')
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,schemeCode,schemeName,fund_house,scheme_type,scheme_category,date,nav
0,149194,SBI Fixed Maturity Plan (FMP) - Series 51 (184...,SBI Mutual Fund,Mutual Fund,IDF/Growth/Liquid,2024-08-08,11.6690
1,148758,Mirae Asset Corporate Bond Fund Regular IDCW,Mirae Asset Mutual Fund,Open Ended Scheme,Debt Scheme,2024-08-08,11.8699
2,151300,Axis CRISIL IBX 50:50 Gilt Plus SDL June 2028 ...,Axis Mutual Fund,Open Ended Scheme,Other Scheme,2024-08-08,11.2441
3,105025,Invesco India Low Duration Fund - Daily IDCW (...,Invesco Mutual Fund,Open Ended Scheme,Debt Scheme,2024-08-08,1017.9524
4,146188,BANDHAN Overnight Fund - Regular Plan - Daily ...,Bandhan Mutual Fund,Open Ended Scheme,Debt Scheme,2024-08-08,1000.0073
...,...,...,...,...,...,...,...
30962904,100891,Principal Cash Management-Liquid Option-Instit...,PRINCIPAL Mutual Fund,Open Ended Scheme,Uncategorized,2006-04-01,10.0064
30962905,100888,Principal Cash Management-Liquid Option-Growth...,PRINCIPAL Mutual Fund,Open Ended Scheme,Uncategorized,2006-04-01,13.7801
30962906,102913,CANARA ROBECO SAVINGS FUND - REGULAR PLAN - GR...,Canara Robeco Mutual Fund,Open Ended Scheme,Debt Scheme,2006-04-01,10.6652
30962907,102010,UTI Liquid Fund - Regular Plan - Daily IDCW (R...,UTI Mutual Fund,Mutual Fund,Uncategorized,2006-04-01,1017.7652


In [2]:
def verify_data_types(df, expected_dtypes=None):
    """Check that ``df`` exposes column dtypes and, optionally, that they match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    expected_dtypes : mapping of column name -> dtype-like, optional
        Expected dtype per column, e.g. ``{"nav": "float64"}``. When omitted
        (the default) the function only confirms that ``df.dtypes`` can be
        read, which is all the check did before this parameter existed.

    Returns
    -------
    bool
        ``True`` when the dtypes could be read and every expected dtype
        matched; ``False`` when a column is missing, a dtype differs, or any
        exception is raised. Problems are reported with ``print``, not raised.
    """
    try:
        actual_dtypes = df.dtypes
        if expected_dtypes:
            mismatches = {}
            for column, expected in expected_dtypes.items():
                if column not in actual_dtypes.index:
                    mismatches[column] = "missing"
                elif not pd.api.types.is_dtype_equal(actual_dtypes[column], expected):
                    mismatches[column] = f"{actual_dtypes[column]} (expected {expected})"
            if mismatches:
                print(f"Data Type Verification Failed: {mismatches}")
                return False
        print("Data Types Verified.")
        return True
    except Exception as e:
        # Best-effort check: report the problem and signal failure to the caller.
        print(f"Data Type Verification Failed: {e}")
        return False

verify_data_types(df)

Data Types Verified.


True

In [3]:
def check_missing_values(df):
    """Report whether ``df`` contains any null cells.

    Prints a one-line verdict and returns ``True`` when no column has a
    missing value, ``False`` as soon as at least one column does.
    """
    nulls_per_column = df.isnull().sum()
    if not nulls_per_column.any():
        print("No Missing Values.")
        return True
    print("Missing Values Found.")
    return False

check_missing_values(df)

No Missing Values.


True

In [4]:
def check_duplicates(df):
    """Report whether ``df`` contains fully duplicated rows.

    Rows are compared across all columns. Prints either the number of
    repeated rows or a "none found" message, and returns ``True`` only when
    there are no duplicates.
    """
    repeated_rows = df.duplicated().sum()
    if not repeated_rows > 0:
        print("No Duplicates Found.")
        return True
    print(f"Duplicates Found: {repeated_rows}")
    return False

check_duplicates(df)

No Duplicates Found.


True

In [5]:
def summary_statistics(df):
    """Confirm that descriptive statistics can be computed for ``df``.

    Runs ``df.describe(include='all')`` purely as a smoke test; the resulting
    table is discarded. Returns ``True`` on success and ``False`` (after
    printing the error) if computing the statistics raises.
    """
    succeeded = False
    try:
        # The statistics themselves are not used; only success matters here.
        _ = df.describe(include='all')
        print("Summary Statistics Checked.")
        succeeded = True
    except Exception as err:
        print(f"Summary Statistics Check Failed: {err}")
    return succeeded

summary_statistics(df)

Summary Statistics Checked.


True

In [6]:
def categorical_data_review(df):
    """Confirm that value counts can be computed for every object-dtype column.

    The counts are computed one column at a time and discarded; only success
    or failure is reported. Counting per column avoids building a single
    frame indexed by the union of every distinct value across all columns,
    which is what ``DataFrame.apply(pd.Series.value_counts)`` does and which
    is very memory-hungry for high-cardinality text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns are reviewed.

    Returns
    -------
    bool
        ``True`` if every column could be counted (or there are no object
        columns), ``False`` if any step raised; the error is printed.
    """
    try:
        object_columns = df.select_dtypes(include=['object'])
        # .items() yields one Series per column even if labels are duplicated.
        for _, column in object_columns.items():
            column.value_counts()
        print("Categorical Data Review Passed.")
        return True
    except Exception as e:
        # Best-effort check: report the problem and signal failure to the caller.
        print(f"Categorical Data Review Failed: {e}")
        return False

categorical_data_review(df)

Categorical Data Review Passed.


True

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [8]:
def trend_analysis(df, output_folder):
    """Plot the monthly mean of ``nav`` and save it as ``trend_analysis.png``.

    The ``date`` column is parsed into a local Series, so the caller's frame
    is left untouched and the function can be re-run safely. The plot is
    drawn on its own figure, which is always closed, even when saving fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (anything ``pd.to_datetime`` accepts)
        and a ``nav`` column.
    output_folder : str or path-like
        Existing directory in which ``trend_analysis.png`` is written.

    Returns
    -------
    bool
        ``True`` when the plot was saved; ``False`` when ``df`` has no
        ``date`` column or when any step raised (the error is printed).
    """
    try:
        if 'date' not in df.columns:
            print("No 'date' column found for trend analysis.")
            return False

        # Parse into a separate Series instead of overwriting df['date'].
        dates = pd.to_datetime(df['date'])
        monthly_mean_nav = df['nav'].groupby(dates.dt.to_period("M")).mean()

        # Explicit figure/axes: never draws on, or closes, someone else's figure.
        fig, ax = plt.subplots()
        try:
            monthly_mean_nav.plot(ax=ax)
            ax.set_title("Trend Analysis")
            ax.set_xlabel("Month")
            ax.set_ylabel("Mean NAV")
            fig.savefig(os.path.join(output_folder, "trend_analysis.png"))
        finally:
            plt.close(fig)

        print("Trend Analysis Completed.")
        return True
    except Exception as e:
        # Best-effort step: report the problem and signal failure to the caller.
        print(f"Trend Analysis Failed: {e}")
        return False

# Save the trend plot (trend_analysis.png) into the current working directory.
output_folder = './'
trend_analysis(df, output_folder)

Trend Analysis Completed.


True

In [9]:
import pandas as pd

def export_quality_report(df, report, output_csv):
    """Write the quality-check results to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by this function; kept in the signature for its callers.
    report : dict
        Maps each check name to its outcome; every key becomes one CSV row.
    output_csv : str or path-like
        Destination path of the CSV file.
    """
    # orient='index' turns every dict key into a row label.
    pd.DataFrame.from_dict(report, orient='index').to_csv(output_csv)
    print(f"Quality check report exported to {output_csv}")

In [11]:
def main(parquet_file_path, output_folder):
    """Run every data-quality check on a parquet file and export the results.

    Parameters
    ----------
    parquet_file_path : str or path-like
        Parquet file to load and check.
    output_folder : str or path-like
        Directory for the trend plot and ``quality_check_report.csv``;
        created if it does not exist.

    Raises
    ------
    Whatever ``pd.read_parquet`` raises if the file cannot be read.
    """
    # Read the file the caller asked for rather than a hard-coded name.
    df = pd.read_parquet(parquet_file_path)

    os.makedirs(output_folder, exist_ok=True)

    # Each check prints its own verdict and returns True/False; the dict
    # preserves the order in which the checks ran.
    report = {
        'Data Types': verify_data_types(df),
        'Missing Values': check_missing_values(df),
        'Duplicates': check_duplicates(df),
        'Summary Statistics': summary_statistics(df),
        'Categorical Data Review': categorical_data_review(df),
        'Trend Analysis': trend_analysis(df, output_folder),
    }

    # Export report to CSV
    output_csv = os.path.join(output_folder, 'quality_check_report.csv')
    export_quality_report(df, report, output_csv)

# Entry point: run all quality checks on the parquet file and write the
# trend plot and CSV report into the current directory.
if __name__ == "__main__":
    parquet_file_path = 'mf_historical_clean.parquet'
    output_folder = './'
    main(parquet_file_path, output_folder)

Data Types Verified.
No Missing Values.
No Duplicates Found.
Summary Statistics Checked.
Categorical Data Review Passed.
Trend Analysis Completed.
Quality check report exported to ./quality_check_report.csv
