In [2]:
import os

In [3]:
# Work from the project root so that relative paths used later in the
# notebook (e.g. "artifacts/...") resolve.
# The location can be overridden through the DATASCIENCE_PROJECT_ROOT
# environment variable; the original hard-coded path remains the default, so
# existing setups keep working unchanged.
os.chdir(os.environ.get("DATASCIENCE_PROJECT_ROOT", "E:\\datascienceproject"))

# Show the directory now in effect (pure-Python equivalent of the %pwd magic).
os.getcwd()

'E:\\datascienceproject'

In [4]:
import pandas as pd
# Labels for the 14 columns of the wine.data file, in file order.
column_names = [
    "class", "Alcohol", "Malic acid", "Ash",
    "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids",
    "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue",
    "OD280/OD315 of diluted wines", "Proline",
]

# header=None: no row of the file is treated as a header, so the labels
# above are supplied explicitly.
data = pd.read_csv(
    "artifacts/data_ingestion/wine.data",
    names=column_names,
    header=None,
)

data.head()



Unnamed: 0,class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
# Print a per-column summary (non-null counts and dtypes) of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   class                         178 non-null    int64  
 1   Alcohol                       178 non-null    float64
 2   Malic acid                    178 non-null    float64
 3   Ash                           178 non-null    float64
 4   Alcalinity of ash             178 non-null    float64
 5   Magnesium                     178 non-null    int64  
 6   Total phenols                 178 non-null    float64
 7   Flavanoids                    178 non-null    float64
 8   Nonflavanoid phenols          178 non-null    float64
 9   Proanthocyanins               178 non-null    float64
 10  Color intensity               178 non-null    float64
 11  Hue                           178 non-null    float64
 12  OD280/OD315 of diluted wines  178 non-null    float64
 13  Proli

In [7]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

class                           0
Alcohol                         0
Malic acid                      0
Ash                             0
Alcalinity of ash               0
Magnesium                       0
Total phenols                   0
Flavanoids                      0
Nonflavanoid phenols            0
Proanthocyanins                 0
Color intensity                 0
Hue                             0
OD280/OD315 of diluted wines    0
Proline                         0
dtype: int64

In [8]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(178, 14)

In [9]:
from dataclasses import dataclass 
from pathlib import Path  
@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation step."""

    # Directory created for this step's outputs.
    root_dir: Path
    # File the validation outcome is written to.
    STATUS_FILE: str
    # Path handed to pd.read_csv when the data is loaded for validation.
    unzip_data_dir: Path
    # Mapping whose keys are the expected column names.
    all_schema: dict

In [10]:
from src.datascienece.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from src.datascienece.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
            schema_file_path: Path of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the loaded YAML content.

        Creates the stage's root directory as a side effect.

        Returns:
            DataValidationConfig populated from the ``data_validation`` section
            of the config and the ``COLUMNS`` section of the schema.
        """
        section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        stage_root = section.root_dir
        create_directories([stage_root])

        return DataValidationConfig(
            root_dir=stage_root,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            all_schema=column_schema,
        )
        

In [13]:
import os   
from src.datascienece import logger


In [25]:
class DataValidation:
    """Checks the ingested data file against the column schema and records the outcome."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``unzip_data_dir`` (path of the CSV to
                check), ``all_schema`` (mapping keyed by expected column name)
                and ``STATUS_FILE`` (path the outcome is written to).
        """
        # The annotation is quoted so this class can be defined even when
        # DataValidationConfig has not been defined in the session yet.
        self.config = config

    def validation_all_columns(self) -> bool:
        """Validate that the data file has exactly the columns the schema expects.

        The file is read with ``header=None`` and *without* forcing column
        names. Passing the schema keys as ``names=`` would make the resulting
        frame match the schema by construction (extra columns in the file are
        absorbed as an index, missing ones are filled with NaN), so the check
        could never fail. Because the file carries no header row, the column
        count is the property that can be verified against the schema.

        The outcome is always written to ``STATUS_FILE`` as
        ``validation_status: True`` or ``validation_status: False``; if reading
        the file fails, ``False`` is recorded and the error propagates.

        Returns:
            True when the number of columns in the file equals the number of
            columns declared in the schema, otherwise False.

        Raises:
            Exception: Any error raised while reading the data file or writing
                the status file.
        """
        # Pessimistic default: this is what gets recorded if reading fails.
        validation_status = False
        try:
            expected_cols = list(self.config.all_schema.keys())

            # Let pandas report how many columns the file really has.
            raw = pd.read_csv(self.config.unzip_data_dir, header=None)

            validation_status = bool(raw.shape[1] == len(expected_cols))
            return validation_status
        finally:
            # Runs on both the success and the failure path.
            with open(self.config.STATUS_FILE, "w") as f:
                f.write(f"validation_status: {validation_status}")

In [26]:
# Run the validation step end to end: load the configuration, build the
# validator from it, and run the column check (which writes the status file).
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validation_all_columns()
except Exception as e:
    # Nothing is handled here; the error is re-raised unchanged.
    raise e

[2026-02-23 12:05:26,205: INFO: common]: yaml file: config\config.yaml loaded successfully
[2026-02-23 12:05:26,207: INFO: common]: yaml file: parameters.yaml loaded successfully
[2026-02-23 12:05:26,210: INFO: common]: yaml file: schema.yaml loaded successfully
[2026-02-23 12:05:26,211: INFO: common]: Directory created at: artifacts
[2026-02-23 12:05:26,213: INFO: common]: Directory created at: artifacts/data_validation
