# Importing Dependencies

In [None]:
!pip install sdv

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer

# Downloading the Building Data Genome Project 2 Dataset

In [None]:
!mkdir raw
!curl -o raw/metadata.csv https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data/metadata/metadata.csv
!curl -o raw/weather.csv https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data/weather/weather.csv
!curl -o raw/electricity_cleaned.csv https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data/meters/cleaned/electricity_cleaned.csv

# Filling in missing data

## Preprocessing Raw Data

In [None]:
# Static building attributes, indexed by building_id.
fields = ["building_id", "sub_primaryspaceusage", "sqft", "yearbuilt", "numberoffloors"]
dfm = pd.read_csv("raw/metadata.csv", usecols=fields).set_index("building_id")

# Every column except 'timestamp' is averaged over all rows; the column labels
# become the 'building_id' index and the mean becomes 'power_consumption'.
dfe = pd.read_csv("raw/electricity_cleaned.csv")
dfe_cleaned = (
    dfe.drop(columns=["timestamp"])
    .mean()
    .rename("power_consumption")
    .rename_axis("building_id")
    .to_frame()
)

# Inner join on building_id, then expose building_id as a regular column again.
data = pd.merge(dfm, dfe_cleaned, left_index=True, right_index=True).reset_index()

weather = ["site_id", "airTemperature", "dewTemperature"]
dfw = pd.read_csv("raw/weather.csv", usecols=weather)

# Mean weather per site. The reset_index() result is kept here (it was
# previously computed and discarded), so 'site_id' is a real column for the merge.
dfw_mean = dfw.groupby("site_id").mean().reset_index()

# The site identifier is the part of building_id before the first underscore.
data["site_id"] = data["building_id"].str.split("_").str[0]
merged_df = pd.merge(data, dfw_mean, on="site_id", how="left")

def calculate_relative_humidity(air_temp, dew_temp):
    """Return relative humidity in percent from air and dew-point temperature.

    Evaluates 100 * exp(17.625 * Td / (Td + 243.04)) / exp(17.625 * T / (T + 243.04)),
    where T is ``air_temp`` and Td is ``dew_temp``. Works element-wise on
    scalars, NumPy arrays and pandas Series.

    NOTE(review): the coefficients look like the Magnus approximation, which
    expects temperatures in degrees Celsius -- confirm the units of the inputs.
    """
    def _exp_term(temp):
        # Shared exponential term, applied to both temperatures.
        return np.exp((17.625 * temp) / (temp + 243.04))

    return 100 * (_exp_term(dew_temp) / _exp_term(air_temp))

# Derive humidity from the per-site mean air and dew-point temperatures.
merged_df['humidity'] = calculate_relative_humidity(merged_df['airTemperature'], merged_df['dewTemperature'])

new_column_order = ['sub_primaryspaceusage', 'sqft', 'yearbuilt', 'numberoffloors', 'airTemperature', "humidity", 'power_consumption']

# Drop identifier/helper columns, put the target last and rename the
# building-usage column to 'building_type'.
final_df = (
    merged_df
    .drop(columns=["building_id", "site_id", "dewTemperature"])
    .loc[:, new_column_order]
    .rename(columns={'sub_primaryspaceusage': 'building_type'})
)

# Keep only rows with a known target. The explicit copy makes final_df an
# independent frame, so the later in-place imputation via .loc is unambiguous
# and does not trigger SettingWithCopyWarning.
final_df = final_df[final_df['power_consumption'].notna()].copy()

final_df

## Rows with no missing values

In [None]:
# Complete cases only: drop every row that has at least one missing value.
no_na = final_df.dropna(axis=0, how="any")
no_na

## Fit Gaussian Copula to rows with no missing values

In [None]:
def _scalar_range(column, low, high):
    """Build a ScalarRange constraint dict for `column` with non-strict bounds."""
    return {
        'constraint_class': 'ScalarRange',
        'constraint_parameters': {
            'column_name': column,
            'low_value': low,
            'high_value': high,
            'strict_boundaries': False,
        },
    }


# Column types are inferred from the complete-case rows.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(no_na)

synthesizer = GaussianCopulaSynthesizer(
    metadata,
    default_distribution='gaussian_kde',
    enforce_min_max_values=False,
    enforce_rounding=True,
)

# sqft and yearbuilt are bounded by the range observed in final_df;
# numberoffloors uses a fixed 1-100 range.
sqft_r = _scalar_range('sqft', final_df['sqft'].min(), final_df['sqft'].max())
numberoffloors_r = _scalar_range('numberoffloors', 1, 100)
yearbuilt_r = _scalar_range('yearbuilt', final_df['yearbuilt'].min(), final_df['yearbuilt'].max())

power_r = {
    'constraint_class': 'Positive',
    'constraint_parameters': {
        'column_name': 'power_consumption',
        'strict_boundaries': True,
    },
}

all_constraints = [sqft_r, numberoffloors_r, yearbuilt_r, power_r]
synthesizer.add_constraints(constraints=all_constraints)
synthesizer.fit(no_na)

### Evaluating how well this fits the actual distribution

In [None]:
SAMPLES = 300  # number of synthetic rows drawn for the evaluation

# Draw the rows in batches of 10.
samples = synthesizer.sample(batch_size=10, num_rows=SAMPLES)
samples

In [None]:
eval_real = no_na.copy()
eval_generated = samples.copy()

# Fit ONE encoder on the categories of both frames. Fitting separately (as
# before) gives each frame its own code table, so the same building type can
# map to different integers whenever the generated data lacks a category,
# making the encoded columns incomparable.
le = LabelEncoder()
le.fit(pd.concat([eval_real['building_type'], eval_generated['building_type']]))
eval_real['building_type'] = le.transform(eval_real['building_type'])
eval_generated['building_type'] = le.transform(eval_generated['building_type'])

# Count of negative sqft values in the real data.
# NOTE(review): possibly meant to check eval_generated -- confirm intent.
(eval_real['sqft'] < 0).sum()

In [None]:
# One KDE panel per column, real vs. generated. The number of subplot rows
# follows the number of columns instead of a hard-coded 8, so the cell keeps
# working if columns are added.
plot_columns = list(eval_real.columns)

fig = plt.figure(figsize=(5, 10))
fig.subplots_adjust(hspace=1, wspace=0.4)

for n, cat in enumerate(plot_columns):
    ax = fig.add_subplot(len(plot_columns), 1, n + 1)
    sns.kdeplot(eval_real[cat], label='real', ax=ax)
    sns.kdeplot(eval_generated[cat], label='fake', ax=ax)
    # Attach the legend to this panel explicitly rather than via pyplot state.
    ax.legend()

In [None]:
# Compute each correlation matrix once and reuse it for the difference panel.
corr_real = eval_real.corr()
corr_generated = eval_generated.corr()

# (title, matrix) for each panel, left to right.
corr_panels = [
    ("Real", corr_real),
    ("Generated", corr_generated),
    ("Absolute difference", (corr_real - corr_generated).abs()),
]

fig_corr = plt.figure(figsize=(20, 10))
for position, (title, corr) in enumerate(corr_panels, start=1):
    ax = fig_corr.add_subplot(1, len(corr_panels), position)
    sns.heatmap(corr, annot=True, ax=ax)
    # Title each panel so the figure can be read on its own.
    ax.set_title(title)

## Using samples from this distribution to fill in the missing values

In [None]:
# Missing values per column.
final_df.isna().sum()

In [None]:
na_lst = ['building_type','yearbuilt','numberoffloors']

# Fill one column at a time: for the rows where that column is missing,
# condition the synthesizer on the columns outside na_lst and take the
# sampled values for the column being filled.
for it in na_lst:
    missing = final_df[it].isna()
    known = final_df.loc[missing].drop(columns=na_lst)
    imputed = synthesizer.sample_remaining_columns(
        known_columns=known,
        max_tries_per_batch=100,
    )
    final_df.loc[missing, it] = imputed[it]

# Remaining missing values per column after filling.
final_df.isnull().sum(axis = 0)

## Saving Result

In [None]:
# Name the row index so it is written to the CSV under the header 'building_id'.
final_df.index.name = 'building_id'
final_df.to_csv(path_or_buf="real_data.csv")

# Generating New Synthetic Data

## Set Parameters

In [None]:
CULL_FRACTION = 1      # fraction of the dataset to use
TRAIN_FRACTION = 0.8   # fraction of rows used for training
EPOCHS = 2_500         # number of training iterations
SAMPLES = 10_000       # number of samples to generate

## Load Dataset

In [None]:
# Fixed seed so the shuffle and the train/test split are identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
RANDOM_STATE = 42

df = pd.read_csv("real_data.csv").set_index("building_id")

# Shuffle and optionally subsample, then discard the old index.
cull_df = df.sample(frac=CULL_FRACTION, random_state=RANDOM_STATE).reset_index(drop=True)
train_data, test_data = train_test_split(
    cull_df,
    train_size=TRAIN_FRACTION,
    random_state=RANDOM_STATE,
)
print(train_data.shape, test_data.shape)

## Train CTGAN

In [None]:
# Column types are inferred from the training split.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_data)

# CTGAN trained for EPOCHS epochs with progress output enabled.
synthesizer = CTGANSynthesizer(
    metadata,
    epochs=EPOCHS,
    verbose=True,
    enforce_min_max_values=True,
    enforce_rounding=True,
)
synthesizer.fit(train_data)

In [None]:
# Generate SAMPLES synthetic rows in batches of 1000.
samples = synthesizer.sample(batch_size=1000, num_rows=SAMPLES)
samples

### Evaluating GAN

In [None]:
eval_real = test_data.copy()
eval_generated = samples.copy()

# Fit ONE encoder on the categories of both frames. Fitting separately (as
# before) gives each frame its own code table, so the same building type can
# map to different integers whenever the two frames do not contain exactly
# the same set of categories.
le = LabelEncoder()
le.fit(pd.concat([test_data['building_type'], samples['building_type']]))
eval_real['building_type'] = le.transform(test_data['building_type'])
eval_generated['building_type'] = le.transform(samples['building_type'])

In [None]:
# One KDE panel per column, held-out real data vs. generated data. The number
# of subplot rows follows the number of columns instead of a hard-coded 8.
plot_columns = list(eval_real.columns)

fig = plt.figure(figsize=(5, 10))
fig.subplots_adjust(hspace=1, wspace=0.4)

for n, cat in enumerate(plot_columns):
    ax = fig.add_subplot(len(plot_columns), 1, n + 1)
    sns.kdeplot(eval_real[cat], label='real', ax=ax)
    sns.kdeplot(eval_generated[cat], label='fake', ax=ax)
    # Attach the legend to this panel explicitly rather than via pyplot state.
    ax.legend()

In [None]:
# Compute each correlation matrix once and reuse it for the difference panel.
corr_real = eval_real.corr()
corr_generated = eval_generated.corr()

# (title, matrix) for each panel, left to right.
corr_panels = [
    ("Real", corr_real),
    ("Generated", corr_generated),
    ("Absolute difference", (corr_real - corr_generated).abs()),
]

fig_corr = plt.figure(figsize=(20, 10))
for position, (title, corr) in enumerate(corr_panels, start=1):
    ax = fig_corr.add_subplot(1, len(corr_panels), position)
    sns.heatmap(corr, annot=True, ax=ax)
    # Title each panel so the figure can be read on its own.
    ax.set_title(title)

## Saving Extended Dataset

In [None]:
# Write the generated rows (including the row index) to disk.
samples.to_csv(path_or_buf="generated_data.csv")