In [1]:
import os

In [2]:
os.chdir("E:\\datascienceproject")
%pwd

'E:\\datascienceproject'

In [3]:
import pandas as pd
# The file is read with header=None below, so the column labels are supplied
# here: the "class" label first, followed by the thirteen measurement columns.
column_names = ["class"] + [
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]

# Load the ingested file and show its first rows as the cell output.
data = pd.read_csv(
    "artifacts/data_ingestion/wine.data",
    names=column_names,
    header=None,
)
data.head()



Unnamed: 0,class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [17]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes its output files into.
        data_path: Location of the input data file the stage reads.
        schema: Loaded schema contents; the transformation step reads its
            ``COLUMNS`` mapping to obtain the column names.
    """

    root_dir: Path
    data_path: Path
    schema: dict

In [18]:
from src.datascienece.constants import *
from src.datascienece.utils.common import read_yaml, create_directories


In [32]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema files and prepare the artifacts root.

        Args:
            config_file_path: Path of the main configuration YAML file.
            params_file_path: Path of the parameters YAML file.
            schema_file_path: Path of the schema YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        # The loaded config is accessed by attribute; its artifacts root is
        # passed to create_directories before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Passes the stage's root directory to create_directories, then returns
        a DataTransformationConfig carrying that directory, the input data
        path and the whole loaded schema.
        """
        stage_settings = self.config.data_transformation
        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage_settings.root_dir),
            data_path=Path(stage_settings.data_path),
            schema=self.schema,
        )

In [33]:
import os
from src.datascienece import logger 
from sklearn.model_selection import train_test_split
import pandas as pd



In [34]:
class DataTransformation:
    """Splits the ingested dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input path, output dir, schema)."""
        self.config = config

    def train_test_split(self, test_size: float = 0.2, random_state: int = 42) -> None:
        """Read the input file, split it, and write train.csv / test.csv.

        The input is read without a header row; column names are taken from
        the keys of ``self.config.schema.COLUMNS``. Both output files are
        written into ``self.config.root_dir`` without the index column.

        Args:
            test_size: Forwarded to sklearn's ``train_test_split``; the
                default keeps the original 0.2 split.
            random_state: Seed forwarded to sklearn's ``train_test_split``;
                the default keeps the original value of 42.

        Raises:
            Exception: Any error raised while reading, splitting or writing
                is logged and then re-raised unchanged.
        """
        try:
            logger.info("Reading data from csv file")
            column_names = list(self.config.schema.COLUMNS.keys())
            data = pd.read_csv(self.config.data_path, header=None, names=column_names)

            logger.info("Splitting data into train and test")
            # The unqualified name resolves to the module-level function
            # imported from sklearn.model_selection, not to this method
            # (method names are not in scope inside the method body).
            train_set, test_set = train_test_split(
                data, test_size=test_size, random_state=random_state
            )

            logger.info("Saving transformed data to csv file")
            logger.info(train_set.shape)
            logger.info(test_set.shape)

            print(train_set.shape)
            print(test_set.shape)

            train_set.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
            test_set.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

            logger.info(f"Transformed data saved successfully to {self.config.root_dir}")

        except Exception as e:
            logger.exception(f"Error occurred during data transformation: {e}")
            # Bare raise re-raises the active exception without adding an
            # extra re-raise frame to the traceback.
            raise

In [35]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage config, then split the data and write the train/test files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_split()
except Exception as e:
    logger.exception(f"Error occurred during data transformation: {e}")
    # Bare raise re-raises the active exception as-is so the cell still fails
    # visibly after the error has been logged.
    raise

[2026-02-24 14:04:04,224: INFO: common]: yaml file: config\config.yaml loaded successfully
[2026-02-24 14:04:04,226: INFO: common]: yaml file: parameters.yaml loaded successfully
[2026-02-24 14:04:04,228: INFO: common]: yaml file: schema.yaml loaded successfully
[2026-02-24 14:04:04,230: INFO: common]: Directory created at: artifacts
[2026-02-24 14:04:04,231: INFO: common]: Directory created at: artifacts/data_transformation
[2026-02-24 14:04:04,231: INFO: 1667704859]: Reading data from csv file
[2026-02-24 14:04:04,236: INFO: 1667704859]: Splitting data into train and test
[2026-02-24 14:04:04,244: INFO: 1667704859]: Saving transformed data to csv file
[2026-02-24 14:04:04,245: INFO: 1667704859]: (142, 14)
[2026-02-24 14:04:04,246: INFO: 1667704859]: (36, 14)
(142, 14)
(36, 14)
[2026-02-24 14:04:04,254: INFO: 1667704859]: Transformed data saved successfully to artifacts\data_transformation
