In [1]:
import os

In [2]:
# Move the working directory one level up; later cells use paths relative to it
# (e.g. "artifacts/data_ingestion/cars.csv") — presumably the project root, TODO confirm.
# NOTE(review): not idempotent — re-running this cell moves up another level.
os.chdir("../")

In [3]:
import pandas as pd
from IPython.display import display, HTML
import numpy as np

In [4]:
def create_scrollable_table(df, table_id, title):
    """Render a DataFrame as an HTML table inside a titled, scrollable container.

    Args:
        df: DataFrame to render; its ``to_html()`` output is embedded verbatim.
        table_id: Value used for the ``id`` attribute of the wrapping ``<div>``.
        title: Text placed in the ``<h3>`` heading above the table.

    Returns:
        A single HTML string: heading, opening ``<div>`` (fixed 200px height,
        ``overflow:auto``), the table markup, and the closing ``</div>``.
    """
    fragments = [
        f'<h3>{title}</h3>',
        f'<div id="{table_id}" style="height:200px; overflow:auto;">',
        df.to_html(),
        '</div>',
    ]
    return ''.join(fragments)

In [5]:
# Load the ingested car listings; the path is relative to the current working directory.
df = pd.read_csv("artifacts/data_ingestion/cars.csv")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,113 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0


In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['Make', 'Model', 'Price', 'Year', 'Kilometer', 'Fuel Type',
       'Transmission', 'Color', 'Owner', 'Seller Type', 'Engine', 'Max Power',
       'Max Torque', 'Drivetrain', 'Length', 'Width', 'Height',
       'Seating Capacity', 'Fuel Tank Capacity'],
      dtype='object')

In [7]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2041, 19)

In [8]:
# Per-column dtype and non-null count; columns with fewer non-null entries than rows have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2041 entries, 0 to 2040
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2041 non-null   object 
 1   Model               2041 non-null   object 
 2   Price               2041 non-null   int64  
 3   Year                2041 non-null   int64  
 4   Kilometer           2041 non-null   int64  
 5   Fuel Type           2041 non-null   object 
 6   Transmission        2041 non-null   object 
 7   Color               2041 non-null   object 
 8   Owner               2041 non-null   object 
 9   Seller Type         2041 non-null   object 
 10  Engine              1961 non-null   object 
 11  Max Power           1961 non-null   object 
 12  Max Torque          1961 non-null   object 
 13  Drivetrain          1907 non-null   object 
 14  Length              1977 non-null   float64
 15  Width               1977 non-null   float64
 16  Height

In [9]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

4

In [10]:
# Count rows that exactly repeat an earlier row across all columns.
# NOTE(review): this cell repeats the preceding duplicate check verbatim — candidate for removal.
df.duplicated().sum()

4

In [11]:
# Number of distinct non-null values per column (cardinality of each feature).
df.nunique()

Make                    33
Model                 1040
Price                  618
Year                    22
Kilometer              838
Fuel Type                9
Transmission             2
Color                   17
Owner                    6
Seller Type              3
Engine                 106
Max Power              331
Max Torque             283
Drivetrain               3
Length                 247
Width                  169
Height                 195
Seating Capacity         6
Fuel Tank Capacity      55
dtype: int64

In [12]:
# Describe every numeric column (one row per feature) and show it in a scrollable table.
numerical_features = df.select_dtypes(include=[np.number])
summary_stats = numerical_features.describe().transpose()
html_numerical = create_scrollable_table(
    df=summary_stats,
    table_id='numerical_features',
    title='Summary statistics for numerical features',
)

display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Price,2041.0,1710966.0,2427077.0,49000.0,490000.0,840000.0,1950000.0,35000000.0
Year,2041.0,2016.452,3.354949,1988.0,2014.0,2017.0,2019.0,2022.0
Kilometer,2041.0,54204.75,57531.24,0.0,29000.0,50000.0,72000.0,2000000.0
Length,1977.0,4283.962,442.4595,3099.0,3985.0,4370.0,4630.0,5569.0
Width,1977.0,1768.734,135.4421,1475.0,1695.0,1775.0,1835.0,2220.0
Height,1977.0,1592.442,136.3736,1165.0,1485.0,1545.0,1678.0,1995.0
Seating Capacity,1977.0,5.310066,0.8243909,2.0,5.0,5.0,5.0,8.0
Fuel Tank Capacity,1928.0,52.08729,15.14658,15.0,41.75,50.0,60.0,105.0


In [13]:
# Describe every object-dtype column (count / unique / top / freq) in a scrollable table.
categorical_features = df.select_dtypes(include=[object])
cat_summary_stats = categorical_features.describe().transpose()
html_categorical = create_scrollable_table(
    df=cat_summary_stats,
    table_id='categorical_features',
    title='Summary statistics for categorical features',
)

display(HTML(html_categorical))

Unnamed: 0,count,unique,top,freq
Make,2041,33,Maruti Suzuki,439
Model,2041,1040,X1 sDrive20d xLine,15
Fuel Type,2041,9,Diesel,1044
Transmission,2041,2,Manual,1119
Color,2041,17,White,794
Owner,2041,6,First,1605
Seller Type,2041,3,Individual,1979
Engine,1961,106,1197 cc,227
Max Power,1961,331,89 bhp @ 4000 rpm,87
Max Torque,1961,283,200 Nm @ 1750 rpm,90


In [14]:
# Missing-value count for each column
null_values = df.isna().sum()
html_null_values = create_scrollable_table(
    df=null_values.to_frame(),
    table_id='null_values',
    title='Null values in the dataset',
)

# The same counts expressed as a percentage of all rows
missing_percentage = (null_values / len(df)) * 100
html_missing_percentage = create_scrollable_table(
    df=missing_percentage.to_frame(),
    table_id='missing_percentage',
    title='Percentage of missing values for each feature',
)

display(HTML(html_null_values + html_missing_percentage))

Unnamed: 0,0
Make,0
Model,0
Price,0
Year,0
Kilometer,0
Fuel Type,0
Transmission,0
Color,0
Owner,0
Seller Type,0

Unnamed: 0,0
Make,0.0
Model,0.0
Price,0.0
Year,0.0
Kilometer,0.0
Fuel Type,0.0
Transmission,0.0
Color,0.0
Owner,0.0
Seller Type,0.0


In [15]:
# Split column labels by dtype: everything that is not object dtype vs. object-dtype columns.
num_features = df.select_dtypes(exclude="object").columns
cat_features = df.select_dtypes(include="object").columns

In [16]:
# Fill gaps in place: non-object columns get their column mean,
# object columns get the first row of their per-column mode.
# NOTE(review): this overwrites `df`, so outputs of earlier cells describe the pre-imputation frame.
df[num_features] = df[num_features].fillna(df[num_features].mean())
df[cat_features] = df[cat_features].fillna(df[cat_features].mode().iloc[0])

In [17]:
# Recount missing values per column for the current state of `df`
null_values = df.isna().sum()
html_null_values = create_scrollable_table(
    df=null_values.to_frame(),
    table_id='null_values',
    title='Null values in the dataset',
)

# Express the counts as a percentage of the row total
missing_percentage = (null_values / len(df)) * 100
html_missing_percentage = create_scrollable_table(
    df=missing_percentage.to_frame(),
    table_id='missing_percentage',
    title='Percentage of missing values for each feature',
)

display(HTML(html_null_values + html_missing_percentage))

Unnamed: 0,0
Make,0
Model,0
Price,0
Year,0
Kilometer,0
Fuel Type,0
Transmission,0
Color,0
Owner,0
Seller Type,0

Unnamed: 0,0
Make,0.0
Model,0.0
Price,0.0
Year,0.0
Kilometer,0.0
Fuel Type,0.0
Transmission,0.0
Color,0.0
Owner,0.0
Seller Type,0.0


In [18]:
# `data` becomes a second name for the same DataFrame object as `df`; no copy is made,
# so changes through either name are visible through the other.
data = df

In [19]:
# Per-column dtype and non-null count of `data` (the same object as `df`).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2041 entries, 0 to 2040
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2041 non-null   object 
 1   Model               2041 non-null   object 
 2   Price               2041 non-null   int64  
 3   Year                2041 non-null   int64  
 4   Kilometer           2041 non-null   int64  
 5   Fuel Type           2041 non-null   object 
 6   Transmission        2041 non-null   object 
 7   Color               2041 non-null   object 
 8   Owner               2041 non-null   object 
 9   Seller Type         2041 non-null   object 
 10  Engine              2041 non-null   object 
 11  Max Power           2041 non-null   object 
 12  Max Torque          2041 non-null   object 
 13  Drivetrain          2041 non-null   object 
 14  Length              2041 non-null   float64
 15  Width               2041 non-null   float64
 16  Height

In [20]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings read by the DataValidation stage."""
    # Directory in which DataValidation writes its cleaned "cars.csv".
    root_dir: Path
    # Path of the text file that receives the "Validation status: ..." line.
    STATUS_FILE: str
    # Path handed to pd.read_csv as the input data (a file path, despite the "dir" name).
    unzip_data_dir: Path
    # Mapping whose keys are the column names accepted by validation.
    all_schema: dict

In [21]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [22]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load config, params and schema via ``read_yaml`` and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings for the data-validation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` before
        building the config object.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the config and the ``COLUMNS`` section of the schema.
        """
        stage_settings = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([stage_settings.root_dir])

        return DataValidationConfig(
            root_dir=stage_settings.root_dir,
            STATUS_FILE=stage_settings.STATUS_FILE,
            unzip_data_dir=stage_settings.unzip_data_dir,
            all_schema=column_schema,
        )

In [23]:
import os
from mlProject import logger

In [26]:
class DataValidation:
    """Cleans the raw car listings and checks the resulting columns against the schema.

    The config object must expose:
        unzip_data_dir: path of the input CSV (passed to ``pd.read_csv``),
        root_dir: directory that receives the cleaned ``cars.csv``,
        STATUS_FILE: path of the text file that receives the validation status,
        all_schema: mapping whose keys are the accepted column names.
    """

    def __init__(self, config: "DataValidationConfig"):
        self.config = config

    @staticmethod
    def _spec_component(value, want_rpm):
        """Extract one integer from a power/torque spec string.

        Two layouts are understood:
            spaced  - ``"87 bhp @ 6000 rpm"``: magnitude is the first token,
                      rpm is the second-to-last token;
            compact - ``"100@5000"`` (no spaces): magnitude before ``@``, rpm after.

        Args:
            value: Cell value; a missing value (whose ``str()`` is ``"nan"``) yields None.
            want_rpm: False for the magnitude, True for the rpm figure.

        Returns:
            The parsed int, or None for a missing value.

        Raises:
            ValueError / IndexError: if the selected token is absent or not an integer.
        """
        if str(value) == "nan":
            return None
        tokens = value.split(" ")
        if len(tokens) == 1:
            return int(value.split("@")[1 if want_rpm else 0])
        return int(tokens[-2 if want_rpm else 0])

    @staticmethod
    def _leading_int(value):
        """Return the first space-separated token as int (e.g. ``"1198 cc"`` -> 1198), None if missing."""
        if str(value) == "nan":
            return None
        return int(value.split(" ")[0])

    @classmethod
    def _clean_listings(cls, df):
        """Derive numeric spec columns, drop raw text columns, de-duplicate and impute.

        Steps, in order:
            1. Add TorquePower / TorquePowerRPM / HorsePower / HorsePowerRPM parsed
               from "Max Torque" and "Max Power"; convert "Engine" to its leading integer.
            2. Drop "Max Power", "Model" and "Max Torque".
            3. Strip surrounding whitespace from every string cell.
            4. Drop exact duplicate rows.
            5. Fill gaps: column mean for non-object columns, first mode for object columns.

        Args:
            df: Raw listings frame; must contain "Max Torque", "Max Power", "Engine", "Model".

        Returns:
            The cleaned DataFrame.
        """
        df["TorquePower"] = [cls._spec_component(v, False) for v in df["Max Torque"]]
        df["TorquePowerRPM"] = [cls._spec_component(v, True) for v in df["Max Torque"]]
        df["HorsePower"] = [cls._spec_component(v, False) for v in df["Max Power"]]
        df["HorsePowerRPM"] = [cls._spec_component(v, True) for v in df["Max Power"]]
        df["Engine"] = [cls._leading_int(v) for v in df["Engine"]]
        df.drop(["Max Power", "Model", "Max Torque"], axis=1, inplace=True)

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
        # look the method up on the class so a column named "map" cannot shadow it.
        elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
        df = elementwise(df, lambda x: x.strip() if isinstance(x, str) else x)
        df.drop_duplicates(inplace=True)

        num_features = df.select_dtypes(exclude="object").columns
        cat_features = df.select_dtypes(include="object").columns
        df[num_features] = df[num_features].fillna(df[num_features].mean())
        # mode() is empty when there are no object columns or no non-null values;
        # indexing its first row would raise IndexError, so skip imputation then.
        if len(cat_features) > 0:
            modes = df[cat_features].mode()
            if not modes.empty:
                df[cat_features] = df[cat_features].fillna(modes.iloc[0])
        return df

    def validate_all_columns(self)-> bool:
        """Clean the input CSV, save it, and validate its columns against the schema.

        Reads ``config.unzip_data_dir``, writes the cleaned frame to
        ``{config.root_dir}/cars.csv`` and writes ``"Validation status: <bool>"``
        to ``config.STATUS_FILE`` exactly once.

        Returns:
            True only if every column of the cleaned frame is a key of
            ``config.all_schema``; False if at least one column is not.

        Raises:
            Any exception from reading, parsing or writing propagates unchanged.
        """
        data = self._clean_listings(pd.read_csv(self.config.unzip_data_dir))

        data.to_csv(f"{self.config.root_dir}/cars.csv",index=False)

        all_schema = self.config.all_schema.keys()
        # Judge all columns together: one unexpected column fails validation,
        # no matter where it sits in the frame.
        unexpected_cols = [col for col in data.columns if col not in all_schema]
        validation_status = not unexpected_cols

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [27]:
# Build the data-validation stage from the YAML configuration and run it.
# Exceptions are left to propagate as-is; a catch-and-re-raise wrapper would add nothing.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# Last expression of the cell, so the returned validation status is displayed.
data_validation.validate_all_columns()

[2023-08-08 17:10:35,016: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-08 17:10:35,017: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-08 17:10:35,019: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-08-08 17:10:35,020: INFO: common: created directory at: artifacts]
[2023-08-08 17:10:35,021: INFO: common: created directory at: artifacts/data_validation]
