# **Importing the Packages**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
import seaborn as sns
%matplotlib inline

# **Importing the Files into DataFrames**

In [None]:
# Folder holding the two yearly benchmarking CSVs. Change this single value
# when the files live somewhere other than /content.
DATA_DIR = '/content'
df1 = pd.read_csv(f'{DATA_DIR}/2015-building-energy-benchmarking.csv')
df2 = pd.read_csv(f'{DATA_DIR}/2016-building-energy-benchmarking.csv')

# **Preprocessing**

In [None]:
# Rename the two GHG columns of the 2016 frame (presumably so they line up
# with the 2015 column names before the shared columns are computed).
ghg_column_names = {
    'TotalGHGEmissions': 'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity': 'GHGEmissionsIntensity(kgCO2e/ft2)',
}
df2 = df2.rename(columns=ghg_column_names)

In [None]:
# Columns present in both yearly frames, kept in df1's column order.
# Membership is tested against the Index directly instead of rebuilding a
# Python list of df2's columns on every iteration.
Cols = [col for col in df1.columns if col in df2.columns]

In [None]:
# Restrict both yearly frames to the shared columns, in the same order.
df_1 = df1.loc[:, Cols]
df_2 = df2.loc[:, Cols]

In [None]:
# Stack the two yearly frames row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each input frame's own row labels,
# so label-based lookups on final_df stay unambiguous.
final_df = pd.concat([df_1, df_2], axis=0, ignore_index=True)

In [None]:
# Columns removed from the combined frame before plotting and modelling.
unused_columns = [
    'OSEBuildingID',
    'DataYear',
    'BuildingType',
    'PrimaryPropertyType',
    'PropertyName',
    'TaxParcelIdentificationNumber',
    'CouncilDistrictCode',
    'Neighborhood',
    'YearBuilt',
    'PropertyGFATotal',
    'ListOfAllPropertyUseTypes',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'YearsENERGYSTARCertified',
    'ENERGYSTARScore',
    'SiteEUIWN(kBtu/sf)',
    'SiteEnergyUse(kBtu)',
    'SiteEnergyUseWN(kBtu)',
    'SourceEUIWN(kBtu/sf)',
    'Electricity(kWh)',
    'NaturalGas(therms)',
    'GHGEmissionsIntensity(kgCO2e/ft2)',
    'DefaultData',
    'ComplianceStatus',
    'Outlier',
]
final_df = final_df.drop(columns=unused_columns)

In [None]:
# Display final_df to inspect the columns that remain.
final_df

## **Plots**

### **Plots for Continuous Variables**

In [None]:
# Use seaborn's "white" style for the plots.
sns.set( style = "white" )
# NumPy random state seeded with 10.
# NOTE(review): `rs` is not referenced by any other cell visible here; confirm it is still needed.
rs = np.random.RandomState( 10 )

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['PropertyGFABuilding(s)'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['PropertyGFAParking'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['SiteEUI(kBtu/sf)'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['SourceEUI(kBtu/sf)'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['SteamUse(kBtu)'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['Electricity(kBtu)'], bins=5, figsize=(20, 14));

In [None]:
# DataFrame.hist opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty figure in the output.
# The trailing semicolon hides the returned axes array.
final_df.hist(column=['NaturalGas(kBtu)'], bins=5, figsize=(20, 14));

### **Plots for Categorical Data**

In [None]:
# Draw on an explicit axes and pass the column as `x=`: newer seaborn releases
# bind the first positional argument to `data`, not to `x`, so a positional
# Series no longer yields one bar per distinct value.
fig, ax = plt.subplots(figsize=(20, 14))
sns.countplot(x=final_df['NumberofBuildings'], ax=ax);

In [None]:
# Draw on an explicit axes and pass the column as `x=`: newer seaborn releases
# bind the first positional argument to `data`, not to `x`, so a positional
# Series no longer yields one bar per distinct value.
fig, ax = plt.subplots(figsize=(20, 14))
sns.countplot(x=final_df['NumberofFloors'], ax=ax);

In [None]:
# Draw on an explicit axes and pass the column as `x=`: newer seaborn releases
# bind the first positional argument to `data`, not to `x`, so a positional
# Series no longer yields one bar per distinct value.
fig, ax = plt.subplots(figsize=(20, 14))
sns.countplot(x=final_df['LargestPropertyUseType'], ax=ax);

## **Label Encoder for Encoding String Categorical features**

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Remove rows whose use type is missing *before* encoding. Fitting the encoder
# on a column that still contains NaN either raises (mixed str/float values)
# or turns NaN into an ordinary integer class, depending on the scikit-learn
# version. In the latter case a later dropna() can no longer see those rows.
use_type_known = final_df.dropna(subset=['LargestPropertyUseType'])

# assign() returns a new frame, so re-running this cell is safe and no
# chained-assignment warning is triggered.
final_df = use_type_known.assign(
    LargestPropertyUseType=le.fit_transform(use_type_known['LargestPropertyUseType'])
)

In [None]:
# Display final_df again to inspect the encoded LargestPropertyUseType column.
final_df

## **Checking for Null Values and eliminating them**

In [None]:
# Count the missing values in each column.
final_df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
final_df = final_df.dropna(how='any')

## **Splitting the data into Dependent and Independent Variables**

In [None]:
# Positional split: every column except the last is a feature, and the last
# column is the target.
n_features = final_df.shape[1] - 1
X = final_df.iloc[:, :n_features]
Y = final_df.iloc[:, n_features]

# **Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
split_parts = train_test_split(X, Y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# **Models**

## **Random Forest Regressor**

### **Random Forest Regressor(Training)**

In [None]:
# Seed the forest so the fitted model (and the R2 score computed from it) is
# reproducible across kernel restarts. 42 matches the seed used for the
# train/test split.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)

### **Random Forest Regressor(Prediction)**

In [None]:
# Predict the target for the held-out test features with the model currently bound to `regr`.
Prediction = regr.predict(X_test)

### **R2-Score**

In [None]:
from sklearn.metrics import r2_score

# R2 of the current predictions against the held-out targets.
Metric = r2_score(y_true=y_test, y_pred=Prediction)
Metric

## **XGBoost**

### **XGBoost Regressor (Training)**

In [None]:
# Fit an XGBoost regressor with its default hyperparameters on the training split.
# NOTE: this rebinds `regr`, so the previously fitted model is no longer reachable by that name.
regr = xgb.XGBRegressor()
regr.fit(X_train, y_train)

### **XGBoost Regressor (Prediction)**

In [None]:
# Predict the target for the held-out test features with the model currently bound to `regr`.
Prediction = regr.predict(X_test)

### **R2-Score**

In [None]:
from sklearn.metrics import r2_score

# R2 of the current predictions against the held-out targets.
Metric = r2_score(y_true=y_test, y_pred=Prediction)
Metric

## **ExtraTrees Regressor**

### **ExtraTrees Regressor(Training)**

In [None]:
# Seed the ensemble so the fitted model (and the R2 score computed from it) is
# reproducible across kernel restarts. 42 matches the seed used for the
# train/test split.
regr = ExtraTreesRegressor(random_state=42)
regr.fit(X_train, y_train)

### **ExtraTrees Regressor(Prediction)**

In [None]:
# Predict the target for the held-out test features with the model currently bound to `regr`.
Prediction = regr.predict(X_test)

### **R2-Score**

In [None]:
from sklearn.metrics import r2_score

# R2 of the current predictions against the held-out targets.
Metric = r2_score(y_true=y_test, y_pred=Prediction)
Metric

# **Predicting an Unseen Point**

In [None]:
Data_Point = final_df.iloc[1867,:-1].to_numpy()

In [None]:
Prediction_true = final_df.iloc[1867,-1]
Prediction_true

In [None]:
Prediction_Model = regr.predict(Data_Point.reshape(1, -1))
Prediction_Model