# Data Analysis for CS506 Project

In CS506_DataProcessing_Graph.ipynb file, datasets are processed and cleaned finally store in 2 csv files. Two files are: **df_201x_BU_analysis_E_Final.csv** and **df_201x_BU_analysis_G_Final.csv**. In this analysis, we directly use those two files to do the analysis

In [1]:
# Import files
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Read data
df_BU_analysis_E = pd.read_csv('df_201x_BU_analysis_E_Final.csv', index_col=0)
df_BU_analysis_G = pd.read_csv('df_201x_BU_analysis_G_Final.csv', index_col=0)

In [3]:
# Browsing data for Electricity
# df_BU_analysis_E.head(10)

In [4]:
# Browsing data for Gas
# df_BU_analysis_G.head(10)

## Linear Regression

In [5]:
# Linear Regression:

# Using Sklearn linear model to do the prediction
linearModel_E = LinearRegression(fit_intercept=True, normalize = True)
linearModel_G = LinearRegression(fit_intercept=True, normalize = True)

# Data for E and G
print(df_BU_analysis_G.columns)
print(df_BU_analysis_E.columns)



# Generating Training and Testing sets for 2 regressions (Electricity, Gas)
df_BU_analysis_E_shuffle = df_BU_analysis_E.sample(frac = 1)
df_BU_analysis_G_shuffle = df_BU_analysis_G.sample(frac = 1)

num_e = int(len(df_BU_analysis_E_shuffle)*0.9)
num_g = int(len(df_BU_analysis_G_shuffle)*0.9)

df_BU_training_E = df_BU_analysis_E_shuffle.iloc[:num_e, :]
df_BU_testing_E = df_BU_analysis_E_shuffle.iloc[num_e:, :]

df_BU_training_G = df_BU_analysis_G_shuffle.iloc[:num_g, :]
df_BU_testing_G = df_BU_analysis_G_shuffle.iloc[num_g:, :]


X_column = ['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']
# X_column = ['Temperature', 'WI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']


# Traning Data
X_E = df_BU_training_E[X_column]
X_G = df_BU_training_G[X_column]

Y_E = df_BU_training_E['Y_Electric']
Y_G = df_BU_training_G['Y_Gas']


# Testing Data
X_E_test = df_BU_testing_E[X_column]
X_G_test = df_BU_testing_G[X_column]

Y_E_test = df_BU_testing_E['Y_Electric']
Y_G_test = df_BU_testing_G['Y_Gas']

print('BU_data shape')
print(df_BU_analysis_E.shape)
print(df_BU_analysis_G.shape)
print(df_BU_analysis_E_shuffle.shape)
print(df_BU_analysis_G_shuffle.shape)

print()
print('Training Data Shape:')
print(X_E.shape)
print(X_G.shape)
print()
print(Y_E.shape)
print(Y_G.shape)


print()
print('Testing Data Shape:')
print(X_E_test.shape)
print(X_G_test.shape)
print()
print(Y_E_test.shape)
print(Y_G_test.shape)

Index(['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports',
       'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel',
       'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical',
       'Y_Gas', 'Y_G_Level'],
      dtype='object')
Index(['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports',
       'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel',
       'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical',
       'Y_Electric', 'Y_E_Level'],
      dtype='object')
BU_data shape
(1512, 16)
(1404, 16)
(1512, 16)
(1404, 16)

Training Data Shape:
(1360, 14)
(1263, 14)

(1360,)
(1263,)

Testing Data Shape:
(152, 14)
(141, 14)

(152,)
(141,)


In [6]:
# Run the regression:

X_column = ['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']
# X_column = ['Temperature', 'WI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']


linearModel_E.fit(X_E, Y_E)
linearModel_G.fit(X_G, Y_G)

# linear regression accuracy through cross validation
score_E = cross_val_score(linearModel_E, X_E, Y_E, cv=3)
score_linear_mean_E = score_E.mean()
score_G = cross_val_score(linearModel_G, X_G, Y_G, cv=3)
score_linear_mean_G = score_G.mean()



np.set_printoptions(suppress=True)

print('E: Parameters')
print(linearModel_E.coef_.shape)
print(linearModel_E.intercept_)

print()
print('G: Parameters')
print(linearModel_G.coef_.shape)
print(linearModel_G.intercept_)


coef_E = linearModel_E.coef_
coef_G = linearModel_G.coef_

coef_E = coef_E.reshape(coef_E.shape[0], 1)
coef_G = coef_G.reshape(coef_G.shape[0], 1)

df_coef_E = pd.DataFrame(coef_E.T, columns = X_column)
df_coef_G = pd.DataFrame(coef_G.T, columns = X_column)

df_coef_E['Const'] = linearModel_E.intercept_
df_coef_G['Const'] = linearModel_G.intercept_
# print(df_coef_E)
# print()
# print(df_coef_G)

E: Parameters
(14,)
-48.13325201120033

G: Parameters
(14,)
61.13502109348905


In [7]:
df_coef_E

Unnamed: 0,Temperature,WI,GHGI,Age,Uses_College,Uses_Sports,Uses_ResidentHousing,Uses_Laboratory,Uses_Office,Uses_Hotel,Uses_Distribution,Uses_Food,Uses_Worship,Uses_Medical,Const
0,0.013288,0.018409,3.772179,0.016693,16.137848,15.964847,14.702094,19.421576,16.342977,16.177176,15.343214,16.583071,14.452517,17.881133,-48.133252


In [8]:
df_coef_G

Unnamed: 0,Temperature,WI,GHGI,Age,Uses_College,Uses_Sports,Uses_ResidentHousing,Uses_Laboratory,Uses_Office,Uses_Hotel,Uses_Distribution,Uses_Food,Uses_Worship,Uses_Medical,Const
0,-0.187267,-0.427835,12.531559,-0.027002,-0.344456,-0.227508,3.051102,-5.384842,0.085196,0.558924,0.649408,-1.105248,1.984628,-1.575424,61.135021


In [9]:
# Mean Squared Error

#predicted comments according to wights
def to_pred(f_coef,f_data):
    # feature dot product with weights
    # feature f_data from testing group
    # weight f_coef from training
    # predicted comments 
    constant = f_coef.loc[0,'Const']
    f_coef.drop(['Const'], axis = 1, inplace= True)
    f_coef = f_coef.T
    pred = np.dot(f_data,f_coef)
    pred = pred + constant
    return pred


#prediction of testing group 
y_pred_E = to_pred(df_coef_E,X_E_test)
y_pred_G = to_pred(df_coef_G,X_G_test)


#true value of testing group
y_true_E = Y_E_test
y_true_G = Y_G_test

#mean squared error
print('---------------Mean Suared Error of Electric---------------')
print(mean_squared_error(y_true_E, y_pred_E))
print('---------------Mean Suared Error of Gas---------------')
print(mean_squared_error(y_true_G, y_pred_G))

---------------Mean Suared Error of Electric---------------
3.633350040459854
---------------Mean Suared Error of Gas---------------
14.454162619617886


In [10]:
print('Accuracy of linear model on electricity is',score_linear_mean_E)
print('Accuracy of linear model on gas is', score_linear_mean_G)

Accuracy of linear model on electricity is 0.800500625002194
Accuracy of linear model on gas is 0.6801249101157193


## Multi-class Logistic Regression

In [12]:
# Run Logistic Regression
# df_BU_analysis_E
# df_BU_analysis_G


# Traning Data
X_E = df_BU_training_E[X_column]
X_G = df_BU_training_G[X_column]

Y_E = df_BU_training_E['Y_E_Level']
Y_G = df_BU_training_G['Y_G_Level']


# Testing Data
X_E_test = df_BU_testing_E[X_column]
X_G_test = df_BU_testing_G[X_column]

Y_E_test = df_BU_testing_E['Y_E_Level']
Y_G_test = df_BU_testing_G['Y_G_Level']




logr_model_E = LogisticRegression()
logr_model_G = LogisticRegression()

logr_model_E.fit(X_E, Y_E)
logr_model_G.fit(X_G, Y_G)

score_E = cross_val_score(logr_model_E, X_E, Y_E, cv=3)
score_G = cross_val_score(logr_model_G, X_G, Y_G, cv=3)

score_logr_mean_E = score_E.mean()
score_logr_mean_G = score_G.mean()

print('Accuracy of logistic model on electricity is',score_logr_mean_E)
print('Accuracy of logistic model on gas is', score_logr_mean_G)

Accuracy of logistic model on electricity is 0.8390122323505799
Accuracy of logistic model on gas is 0.8741185645153234




In [15]:
logr_E_coef = logr_model_E.coef_
logr_G_coef = logr_model_G.coef_
print(logr_E_coef[0, :])
# print(logr_G_coef.shape)

[-0.01461669 -0.00332248 -5.28712221  0.00242437  1.067849    0.17886347
  1.9370223  -1.78204702 -0.49903574 -0.23910736  1.41726598 -1.75753664
  0.39209047 -0.30107442]


## Multi-class SVM

In [None]:
# Run SVM


## Neural Networks

In [None]:
# Run NN
