In [5]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [6]:
TICKER = "AAPL"
PERIOD = "1y"
data = yf.download(TICKER, period=PERIOD, interval="1d", auto_adjust=False)
data.reset_index(inplace=True)
data.rename(columns={
    "Date": "Date",
    "Open": "Open",
    "High": "High",
    "Low": "Low",
    "Close": "Close",
    "Adj Close": "Adj Close",
    "Volume": "Volume"
}, inplace=True)

[*********************100%***********************]  1 of 1 completed


In [7]:
data.head()

Price,Date,Adj Close,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
0,2024-11-07,226.426193,227.479996,227.880005,224.570007,224.630005,42137700
1,2024-11-08,226.157166,226.960007,228.660004,226.410004,227.169998,38328800
2,2024-11-11,223.436813,224.229996,225.699997,221.5,225.0,42005600
3,2024-11-12,223.436813,224.229996,225.589996,223.360001,224.550003,40398300
4,2024-11-13,224.323654,225.119995,226.649994,222.759995,224.009995,48566200


In [8]:
def clean_financial_data(df):
    """Convert Date Format"""
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    """Remove Duplicates"""
    df['Date'] = df['Date'].drop_duplicates()

    """Missing Values"""
    df = df.ffill().bfill()
    df = df.dropna()

    """Inconsistent Data Detection"""
    bad_rows = (df["High"] < df["Low"]) | (df["High"] < df["Open"]) | (df["High"] < df["Close"])
    if bad_rows.sum() > 0:
        print(f"⚠️ Found {bad_rows.sum()} inconsistent rows. Correcting...")
        df.loc[bad_rows, ["Open", "High", "Low", "Close"]] = np.nan
        df = df.ffill().bfill()

    """Return Rate"""
    df["Return"] = df["Adj Close"].pct_change()
    df["LogReturn"] = np.log(df["Adj Close"] / df["Adj Close"].shift(1))

    """Normalization"""
    scaler = MinMaxScaler()
    cols_to_scale = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
    scaled = scaler.fit_transform(df[cols_to_scale])
    scaled_df = pd.DataFrame(scaled, columns=[c + "_Norm" for c in cols_to_scale])
    df = pd.concat([df.reset_index(drop=True), scaled_df], axis=1)

    """"Outliers"""
    z_score = (df["Return"] - df["Return"].mean()) / df["Return"].std()
    outliers = z_score.abs() > 5
    df.loc[outliers, "Return"] = np.nan
    df["Return"] = df["Return"].ffill().bfill()

In [9]:
def quality_report(df):
    report = {
        "total_rows": len(df),
        "missing_ratio": df.isna().mean().round(3).to_dict(),
        "duplicates": df.duplicated(subset=["Date"]).sum(),
        "min_date": str(df["Date"].min().date()),
        "max_date": str(df["Date"].max().date())
    }
    return report