In [1]:
%pwd

'c:\\Users\\91889\\OneDrive\\Desktop\\Diamond-Price-Prediction\\research'

In [2]:
import os

# Move from the notebook folder ("research") up to the project root so the
# relative paths used below (artifacts/..., config/...) resolve.
# Guarded on the folder name so re-running this cell is a no-op instead of
# climbing one more directory on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

%pwd

'c:\\Users\\91889\\OneDrive\\Desktop\\Diamond-Price-Prediction'

In [3]:
import pandas as pd

# Build the path from its components instead of a backslash literal: "\d" is
# not a valid string escape (newer Python versions warn about it) and a
# hard-coded "\" separator only works on Windows.
df = pd.read_csv(os.path.join("artifacts", "data_ingestion", "diamonds.csv"))

# Preview the first rows
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Column labels of the loaded frame; these are the names to record in schema.yaml
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
# Per-column non-null counts and dtypes; the dtypes are the values to record in schema.yaml
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


Update `schema.yaml`: under `COLUMNS`, add each column name as a key with its data type (from `df.info()` above) as the value, in the same order.

In [6]:
# Target column: show the dtype of 'price' so it can be recorded in schema.yaml
print(f"Target: 'price', with a datatype of {df['price'].dtype}")

Target: 'price', with a datatype of int64


Update `schema.yaml` with the target column: `name` as the key and `price` as the value.

### Config Entity

In [14]:
from dataclasses import dataclass
from pathlib import Path

# Data Validation
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``,
    which passes the values through from the YAML config and schema unchanged.
    NOTE(review): the path fields are annotated ``Path`` but are not converted
    on construction, so they may actually be plain strings -- confirm against
    what ``read_yaml`` returns.
    """
    # Output directory for this step; created by the configuration manager.
    root_dir: Path
    # Despite the name, this is passed straight to ``pd.read_csv`` by the
    # validator, so it must point at the CSV file to validate.
    unzip_data_dir: Path
    # File that receives the "Validation Status: <bool>" line.
    STATUS_FILE: Path
    # Mapping of column name -> expected dtype string (the schema's COLUMNS section).
    all_schema: dict

### Configuration Manager

In [15]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [16]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-step config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.

        The defaults are the path constants imported from the project's
        constants module.
        """
        # Parsed YAML contents, kept for the get_*_config accessors
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Top-level output folder shared by every step
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings for the data validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config and the ``COLUMNS`` section of the schema.
        """
        stage_cfg = self.config.data_validation
        column_schema = self.schema.COLUMNS

        # The step writes its status file under this directory
        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            unzip_data_dir=stage_cfg.unzip_data_dir,
            STATUS_FILE=stage_cfg.STATUS_FILE,
            all_schema=column_schema,
        )

### Components - Data Validation

In [17]:
import os
from mlProject import logger
import pandas as pd

In [26]:
class DataValiadtion:
    """Check a CSV's columns and dtypes against the expected schema.

    The class name keeps its original spelling because callers construct it
    by this name.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings (data path, status file, schema)."""
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate the data file against the schema in both directions.

        Validation fails when:
          * the data contains a column the schema does not define,
          * a column's pandas dtype name differs from the schema's value, or
          * the schema defines a column that the data does not contain.

        Only the first problem found is logged. The outcome is always written
        to ``config.STATUS_FILE`` as ``Validation Status: <bool>``.

        Returns:
            True if every data column matches the schema and no schema column
            is absent from the data; otherwise False.

        Raises:
            Any exception from reading the CSV or writing the status file
            propagates unchanged.
        """
        # read csv file
        data = pd.read_csv(self.config.unzip_data_dir)

        # columns actually present in the data
        all_cols = list(data.columns)

        # expected columns -> dtype strings
        all_schema = self.config.all_schema

        validation_status = True  # assume valid until a check fails

        for col in all_cols:
            # Column present in the data but unknown to the schema
            if col not in all_schema:
                validation_status = False
                logger.info(f"Column '{col}' is not defined in the schema")
                break

            # Check if data type matches the schema
            elif str(data[col].dtype) != all_schema[col]:
                validation_status = False
                logger.info(f"Data type mismatch. Column: {col}\nExpected: {all_schema[col]}, Got: {data[col].dtype}")
                break

        # Reverse check: every column the schema requires must be in the data.
        # Without it, a file lacking a required column would pass.
        if validation_status:
            present = set(all_cols)
            for col in all_schema:
                if col not in present:
                    validation_status = False
                    logger.info(f"Column '{col}' is missing")
                    break

        # Save the validation result
        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation Status: {validation_status}")

        return validation_status

### Pipeline

In [28]:
# Build the configuration, then run the column/dtype validation.
# No try/except here: a handler that only re-raises changes nothing, so any
# failure simply propagates with its original traceback.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValiadtion(config=data_validation_config)

# Kept as the cell's last expression so the notebook displays the True/False
# result (it is also written to the status file).
data_validation.validate_all_columns()

[2024-10-19 20:07:13,215: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-19 20:07:13,219: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-19 20:07:13,225: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-19 20:07:13,229: INFO: common: created directory at: artifacts]
[2024-10-19 20:07:13,231: INFO: common: created directory at: artifacts/data_validation]
