# **Data Integrity - Specific Checks**

In [2]:
# Optional setup: uncomment the lines below to install the third-party
# packages imported later in this notebook (deepchecks, prettytable).
# !pip install deepchecks
# !pip install prettytable

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd

# Read the Site EUI dataset from the working directory and preview the
# first five rows (the default for head()).
df = pd.read_csv(filepath_or_buffer='Site_EUI_data.csv')
df.head(5)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


In [5]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Year_Factor                    int64
State_Factor                  object
building_class                object
facility_type                 object
floor_area                   float64
                              ...   
direction_peak_wind_speed    float64
max_wind_speed               float64
days_with_fog                float64
site_eui                     float64
id                             int64
Length: 64, dtype: object

**Check for missing values**

In [6]:
from deepchecks.tabular.checks import MixedNulls

# Run the Mixed Nulls check on the raw frame. The result is left as the
# cell's last expression so the notebook renders it.
mixed_nulls_check = MixedNulls()
mixed_nulls_check.run(df)



VBox(children=(HTML(value='<h4><b>Mixed Nulls</b></h4>'), HTML(value='<p>Search for various types of null valu…

**Check for percentage of Nulls**

In [7]:
from deepchecks.tabular.checks.data_integrity import PercentOfNulls

# Run the PercentOfNulls check on the raw frame and render its report.
percent_of_nulls_check = PercentOfNulls()
result = percent_of_nulls_check.run(df)
result.show()



VBox(children=(HTML(value='<h4><b>PercentOfNulls</b></h4>'), HTML(value='<p>Percent of \'Null\' values in each…

In [8]:
from prettytable import PrettyTable

# Count missing entries per column and keep the 20 columns with the most.
missing_values_count = df.isna().sum()
missing_values_count_sorted = (
    missing_values_count
    .sort_values(ascending=False)
    .head(20)
)

# Render the counts as a two-column text table:
# column names left-aligned, counts right-aligned.
table = PrettyTable()
table.field_names = ["Column", "Missing Values Count"]
table.align["Column"] = "l"
table.align["Missing Values Count"] = "r"

for column in missing_values_count_sorted.index:
    count = missing_values_count_sorted[column]
    table.add_row([column, count])

print(table)

+---------------------------+----------------------+
| Column                    | Missing Values Count |
+---------------------------+----------------------+
| days_with_fog             |                45796 |
| direction_peak_wind_speed |                41811 |
| max_wind_speed            |                41082 |
| direction_max_wind_speed  |                41082 |
| energy_star_rating        |                26709 |
| year_built                |                 1837 |
| Year_Factor               |                    0 |
| november_max_temp         |                    0 |
| cooling_degree_days       |                    0 |
| december_max_temp         |                    0 |
| december_avg_temp         |                    0 |
| december_min_temp         |                    0 |
| october_avg_temp          |                    0 |
| november_avg_temp         |                    0 |
| november_min_temp         |                    0 |
| october_max_temp          |                 

**String Mismatch**

In [9]:
from deepchecks.tabular.checks import StringMismatch

# Run the String Mismatch check on the raw frame and render its report.
string_mismatch_check = StringMismatch()
result = string_mismatch_check.run(df)
result.show()



VBox(children=(HTML(value='<h4><b>String Mismatch</b></h4>'), HTML(value='<p>Detect different variants of stri…

## **Data Integrity Suite**

In [10]:
from deepchecks.tabular.suites import data_integrity

# Build the data-integrity suite, run it on the raw frame, and render the
# combined report.
integ_suite = data_integrity()
suite_result = integ_suite.run(df)
suite_result.show()



Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_K5YU58WIMYJTXLA4GTBVNOVZ5">Data Integrity Sui…

## **Train Test Validation - Specific Checks**

**Train Test Samples Mix**

In [11]:
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import TrainTestSamplesMix
from deepchecks.tabular.datasets.classification import iris

# Create data with leakage from train to test: a handful of training rows
# (some of them repeated) are appended to the iris test data so that the
# check has duplicated samples to report.
train, test = iris.load_data()
leaked_rows = train.data.iloc[[0, 1, 1, 2, 3, 4, 2, 2, 10]]

# Use the public pd.concat API rather than the private DataFrame._append;
# ignore_index=True renumbers the rows exactly as before.
bad_test_df = pd.concat([test.data, leaked_rows], ignore_index=True)
bad_test = test.copy(bad_test_df)

check = TrainTestSamplesMix()
result = check.run(test_dataset=bad_test, train_dataset=train)
result

VBox(children=(HTML(value='<h4><b>Train Test Samples Mix</b></h4>'), HTML(value='<p>Detect samples in the test…

# **Train Test Validation - Suite**

In [12]:
from sklearn.model_selection import train_test_split

# Work on a copy of the raw frame and separate the regression target
# (site_eui) from the feature columns.
df_split = df.copy()
labels = df_split["site_eui"]
train = df_split.drop(columns='site_eui')

# Hold out 30% of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

((53029, 63), (22728, 63))

In [13]:
# Collect the feature columns stored with the generic 'object' dtype.
# The dtype is looked up on df, using the column names of train.
columns = train.columns
categorical_columns = [
    column for column in columns if df[column].dtype == 'object'
]

print(categorical_columns)

['State_Factor', 'building_class', 'facility_type']


In [15]:
from deepchecks.tabular import Dataset

# Wrap each split in a deepchecks Dataset, passing the label series and the
# categorical feature names.
train_ds = Dataset(
    X_train,
    label=y_train,
    cat_features=categorical_columns,
)
test_ds = Dataset(
    X_test,
    label=y_test,
    cat_features=categorical_columns,
)

In [16]:
from deepchecks.tabular.suites import train_test_validation

# Build the train/test validation suite and run it on the two Datasets.
validation_suite = train_test_validation()
suite_result = validation_suite.run(train_ds, test_ds)

# Leave the result as the last expression so the notebook renders it.
suite_result

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_B6F1RUL25E2QPR0VL62U0PZPC">Train Test Validat…

## **Model Evaluation - Specific Checks**

To train a model, the categorical columns are dropped below so that only numeric features remain. A `LabelEncoder` is instantiated, but it is not applied to any column.

In [17]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this encoder is created but never applied in this cell;
# the categorical columns are dropped below rather than encoded.
label_encoder = LabelEncoder()

# Reload the raw data so this section starts from the file contents.
df = pd.read_csv("Site_EUI_data.csv")
categorical_columns = (
    df.select_dtypes(include=['object', 'category']).columns.tolist()
)

# drop() returns a new frame, so df keeps its categorical columns while
# df_copy holds only the remaining ones.
df_copy = df.drop(columns=categorical_columns)
df_copy.head()

Unnamed: 0,Year_Factor,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,january_max_temp,february_min_temp,february_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,61242.0,1942.0,11.0,2.4,36,50.5,68,35,50.589286,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,274000.0,1955.0,45.0,1.8,36,50.5,68,35,50.589286,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,280025.0,1951.0,97.0,1.8,36,50.5,68,35,50.589286,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,55325.0,1980.0,46.0,1.8,36,50.5,68,35,50.589286,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,66000.0,1985.0,100.0,2.4,36,50.5,68,35,50.589286,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


In [20]:
#filling missing values

import pandas as pd
from sklearn.impute import SimpleImputer

# Columns whose share of missing values exceeds this percentage are
# discarded; the remaining columns are mean-imputed.
MAX_MISSING_PERCENT = 85

# Percentage of missing entries per column.
missing_percentages = df_copy.isnull().mean() * 100

columns_to_drop = missing_percentages[missing_percentages > MAX_MISSING_PERCENT].index
df_dropped = df_copy.drop(columns=columns_to_drop)

columns_to_impute = missing_percentages[missing_percentages <= MAX_MISSING_PERCENT].index
imputer = SimpleImputer(strategy='mean')

# NOTE(review): the imputer is fitted on the full frame, before the
# train/test split performed in a later cell, so the column means include
# the rows that end up in the test set. Every column of df_copy, including
# site_eui, passes through the imputer. Consider fitting on the training
# split only.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_dropped[columns_to_impute]),
    columns=columns_to_impute,
    # Keep the source row labels so the result stays aligned with df_dropped
    # even when its index is not the default 0..n-1 range.
    index=df_dropped.index,
)

# Every column that survived the drop is in columns_to_impute, so the
# imputed frame already is the complete processed frame. The previous concat
# with the column-less remainder of df_dropped added nothing.
df_processed = df_imputed.copy()
df_processed.shape

(75757, 61)

In [21]:
# Separate the target (site_eui) from the processed feature columns.
labels = df_processed['site_eui']
train = df_processed.drop(columns='site_eui')

# Same seeded 70/30 split parameters as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

((53029, 60), (22728, 60))

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


# Seed the forest (same seed as the train/test split) so that re-running the
# notebook reproduces the same model and score.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# r2_score is the coefficient of determination of a regression, not a
# classification accuracy, so it is reported under its real name.
r2 = r2_score(y_test, y_pred)
accuracy = r2  # original variable name, kept so existing references still resolve

print("R^2 score:", r2)

Accuracy: 0.3022101654191087


In [23]:
from deepchecks.tabular.checks import ModelInfo

# Run the Model Info check on the fitted regressor. The result is left as
# the cell's last expression so the notebook renders it.
model_info_check = ModelInfo()
model_info_check.run(regressor)

VBox(children=(HTML(value='<h4><b>Model Info</b></h4>'), HTML(value='<p>Summarize given model parameters. <a h…

In [24]:
from deepchecks.tabular import Dataset

# Wrap the processed splits in deepchecks Datasets. An empty cat_features
# list is passed explicitly for both.
train_ds = Dataset(
    X_train,
    label=y_train,
    cat_features=[],
)
test_ds = Dataset(
    X_test,
    label=y_test,
    cat_features=[],
)

In [25]:
from deepchecks.tabular.suites import model_evaluation

# Build the model-evaluation suite, run it with both Datasets and the fitted
# regressor, and render the combined report.
evaluation_suite = model_evaluation()
suite_result = evaluation_suite.run(train_ds, test_ds, regressor)
suite_result.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_RTRXPWF2XJM3O9ZY2UGRQYT3Y">Model Evaluation S…