# Data Analysis for CS506 Project

In CS506_DataProcessing_Graph.ipynb file, datasets are processed and cleaned finally store in 2 csv files. Two files are: **df_201x_BU_analysis_E_Final.csv** and **df_201x_BU_analysis_G_Final.csv**. In this analysis, we directly use those two files to do the analysis

In [1]:
# Import files
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
# import statsmodels.formula.api as smf

  from pandas.core import datetools


In [2]:
# Read data
df_BU_analysis_E = pd.read_csv('df_201x_BU_analysis_E_Final.csv', index_col=0)
df_BU_analysis_G = pd.read_csv('df_201x_BU_analysis_G_Final.csv', index_col=0)

# Data for E and G
print(df_BU_analysis_G.columns)
print(df_BU_analysis_E.columns)

Index(['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports',
       'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel',
       'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical',
       'Y_Gas', 'Y_G_Level'],
      dtype='object')
Index(['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports',
       'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel',
       'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical',
       'Y_Electric', 'Y_E_Level'],
      dtype='object')


In [3]:
# Browsing data for Electricity
# df_BU_analysis_E.head(10)

In [4]:
# Browsing data for Gas
# df_BU_analysis_G.head(10)

In [5]:
# Generating Training and Testing sets for 2 regressions (Electricity, Gas)
df_BU_analysis_E_shuffle = df_BU_analysis_E.sample(frac = 1)
df_BU_analysis_G_shuffle = df_BU_analysis_G.sample(frac = 1)

num_e = int(len(df_BU_analysis_E_shuffle)*0.9)
num_g = int(len(df_BU_analysis_G_shuffle)*0.9)

df_BU_training_E = df_BU_analysis_E_shuffle.iloc[:num_e, :]
df_BU_testing_E = df_BU_analysis_E_shuffle.iloc[num_e:, :]

df_BU_training_G = df_BU_analysis_G_shuffle.iloc[:num_g, :]
df_BU_testing_G = df_BU_analysis_G_shuffle.iloc[num_g:, :]

X_column = ['Temperature', 'WI', 'GHGI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']
# X_column = ['Temperature', 'WI', 'Age', 'Uses_College', 'Uses_Sports', 'Uses_ResidentHousing', 'Uses_Laboratory', 'Uses_Office', 'Uses_Hotel', 'Uses_Distribution', 'Uses_Food', 'Uses_Worship', 'Uses_Medical']


In [6]:
def printVariables():
    for i in range(len(X_column)):
        print('x{}: {}'.format(i+1, X_column[i]))
        

## Linear Regression

In [7]:
# Statesmodel Version of LR
# Traning Data

X_E = df_BU_training_E[X_column]
Y_E = df_BU_training_E['Y_Electric']

X_G = df_BU_training_G[X_column]
Y_G = df_BU_training_G['Y_Gas']


# Testing Data
X_E_test = df_BU_testing_E[X_column]
Y_E_test = df_BU_testing_E['Y_Electric']

X_G_test = df_BU_testing_G[X_column]
Y_G_test = df_BU_testing_G['Y_Gas']

print('BU_data shape')
print(df_BU_analysis_E.shape)
print(df_BU_analysis_G.shape)
print(df_BU_analysis_E_shuffle.shape)
print(df_BU_analysis_G_shuffle.shape)
print()
print('Training Data Shape:')
print(X_E.shape)
print(X_G.shape)
print()
print(Y_E.shape)
print(Y_G.shape)
print()
print('Testing Data Shape:')
print(X_E_test.shape)
print(X_G_test.shape)
print()
print(Y_E_test.shape)
print(Y_G_test.shape)


BU_data shape
(1512, 16)
(1404, 16)
(1512, 16)
(1404, 16)

Training Data Shape:
(1360, 14)
(1263, 14)

(1360,)
(1263,)

Testing Data Shape:
(152, 14)
(141, 14)

(152,)
(141,)


In [8]:
# Run the regression:
version = 'statsModel'
# version = 'sklearn'


if version == 'statsModel':
    # Electricity
    x_e = X_E.values
    y_e = Y_E.values
    x_eTest = X_E_test.values
    y_eTest = Y_E_test.values
    # OLS regression
    x_e = sm.add_constant(x_e)
    model_e = sm.OLS(y_e,x_e)
    results_e = model_e.fit()
    
    
    # Gas
    x_g = X_G.values
    y_g = Y_G.values
    x_gTest = X_G_test.values
    y_gTest = Y_G_test.values
    # OLS regression 
    x_g = sm.add_constant(x_g)
    model_g = sm.OLS(y_g,x_g)
    results_g = model_g.fit()
    
else:
    
    # Using Sklearn linear model to do the prediction
    linearModel_E = LinearRegression(fit_intercept=True, normalize = True)
    linearModel_G = LinearRegression(fit_intercept=True, normalize = True)


    linearModel_E.fit(X_E, Y_E)
    linearModel_G.fit(X_G, Y_G)

    # linear regression accuracy through cross validation
    score_E = cross_val_score(linearModel_E, X_E, Y_E, cv=3)
    score_linear_mean_E = score_E.mean()
    score_G = cross_val_score(linearModel_G, X_G, Y_G, cv=3)
    score_linear_mean_G = score_G.mean()



    np.set_printoptions(suppress=True)

    print('E: Parameters')
    print(linearModel_E.coef_.shape)
    print(linearModel_E.intercept_)

    print()
    print('G: Parameters')
    print(linearModel_G.coef_.shape)
    print(linearModel_G.intercept_)


    coef_E = linearModel_E.coef_
    coef_G = linearModel_G.coef_

    coef_E = coef_E.reshape(coef_E.shape[0], 1)
    coef_G = coef_G.reshape(coef_G.shape[0], 1)

    df_coef_E = pd.DataFrame(coef_E.T, columns = X_column)
    df_coef_G = pd.DataFrame(coef_G.T, columns = X_column)

    df_coef_E['Const'] = linearModel_E.intercept_
    df_coef_G['Const'] = linearModel_G.intercept_


In [9]:
if version == 'sklearn':
    df_coef_E
else:
    printVariables()
    print()
    print(results_e.summary())

x1: Temperature
x2: WI
x3: GHGI
x4: Age
x5: Uses_College
x6: Uses_Sports
x7: Uses_ResidentHousing
x8: Uses_Laboratory
x9: Uses_Office
x10: Uses_Hotel
x11: Uses_Distribution
x12: Uses_Food
x13: Uses_Worship
x14: Uses_Medical

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.803
Method:                 Least Squares   F-statistic:                     397.0
Date:                Tue, 01 May 2018   Prob (F-statistic):               0.00
Time:                        14:20:07   Log-Likelihood:                -2557.2
No. Observations:                1360   AIC:                             5144.
Df Residuals:                    1345   BIC:                             5223.
Df Model:                          14                                         
Covariance Type:            nonrobust                           

In [10]:
if version == 'sklearn':
    df_coef_G
else:
    printVariables()
    print()
    print(results_g.summary())

x1: Temperature
x2: WI
x3: GHGI
x4: Age
x5: Uses_College
x6: Uses_Sports
x7: Uses_ResidentHousing
x8: Uses_Laboratory
x9: Uses_Office
x10: Uses_Hotel
x11: Uses_Distribution
x12: Uses_Food
x13: Uses_Worship
x14: Uses_Medical

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.691
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     214.7
Date:                Tue, 01 May 2018   Prob (F-statistic):          4.74e-307
Time:                        14:20:07   Log-Likelihood:                -3490.7
No. Observations:                1263   AIC:                             7009.
Df Residuals:                    1249   BIC:                             7081.
Df Model:                          13                                         
Covariance Type:            nonrobust                           

In [11]:
# Computing MSE
np.set_printoptions(suppress=True)

#predicted comments according to wights
def to_pred(f_coef,f_data):
    # feature dot product with weights
    # feature f_data from testing group
    # weight f_coef from training
    # predicted comments 
    constant = f_coef.loc[0,'Const']
    f_coef.drop(['Const'], axis = 1, inplace= True)
    f_coef = f_coef.T
    pred = np.dot(f_data,f_coef)
    pred = pred + constant
    return pred


if version == 'sklearn':
    #prediction of testing group 
    y_pred_E = to_pred(df_coef_E,X_E_test)
    y_pred_G = to_pred(df_coef_G,X_G_test)


    #true value of testing group
    y_true_E = Y_E_test
    y_true_G = Y_G_test

    #mean squared error
    print('---------------Mean Suared Error of Electric---------------')
    print(mean_squared_error(y_true_E, y_pred_E))
    print('---------------Mean Suared Error of Gas---------------')
    print(mean_squared_error(y_true_G, y_pred_G))
    
else:
    
    # Electricity
    print('Electricity:')
    const_e = results_e.params[0]
    params_e = results_e.params[1:]
    params_e = np.reshape(params_e, (params_e.shape[0], 1))

    print('Constant:')
    print(const_e)
    print()
    print('Parameters:')
    print(params_e)
    print() 

    # Y_test
    Y_hat_e = np.dot(x_eTest, params_e) + const_e
    print('MSE:')
    print(mean_squared_error(y_eTest, Y_hat_e))
    
    print()
    print('------------------------------')
    print()
    
    # Gas
    print('Gas:')
    const_g = results_g.params[0]
    params_g = results_g.params[1:]
    params_g = np.reshape(params_g, (params_g.shape[0], 1))

    print('Constant:')
    print(const_g)
    print()
    print('Parameters:')
    print(params_g)
    print() 

    # Y_test
    Y_hat_g = np.dot(x_gTest, params_g) + const_g
    print('MSE:')
    print(mean_squared_error(y_gTest, Y_hat_g))
       

Electricity:
Constant:
-48.5370033385

Parameters:
[[  0.0134201 ]
 [  0.0186149 ]
 [  3.84215552]
 [  0.01673605]
 [ 16.40494287]
 [ 16.26467472]
 [ 14.9730736 ]
 [ 19.73691169]
 [ 16.74386715]
 [ 16.43865217]
 [ 15.67306312]
 [ 16.78455942]
 [ 14.75130537]
 [ 17.90513301]]

MSE:
3.67213156215

------------------------------

Gas:
Constant:
53.838274936

Parameters:
[[ -0.18846923]
 [ -0.37682062]
 [ 12.29089102]
 [ -0.02607116]
 [  5.3651161 ]
 [  5.12145345]
 [  8.49172109]
 [  0.38197198]
 [  5.72281296]
 [  6.07396103]
 [  6.27325547]
 [  4.72733824]
 [  7.85575979]
 [  3.82488483]]

MSE:
12.7600393005


In [12]:
# Regularized Version of Linear Regression

# TBD...

In [13]:
# print('Accuracy of linear model on electricity is',score_linear_mean_E)
# print('Accuracy of linear model on gas is', score_linear_mean_G)

## Multi-class Logistic Regression

In [36]:
# Y_E_logi = df_BU_training_E['Y_E_Level']
# Y_G_logi = df_BU_training_G['Y_G_Level']

# Y_E_test_logi = df_BU_testing_E['Y_E_Level']
# Y_G_test_logi = df_BU_testing_G['Y_G_Level']

# # Electricity
# x_e_logi = X_E.values
# y_e_logi = Y_E_logi.values
# x_eTest_logi = X_E_test.values
# y_eTest_logi = Y_E_test_logi.values

# # Multinomial logistic regression 
# x_e_logi = sm.add_constant(x_e_logi)

# model_e_logi = sm.MNLogit(y_e_logi,x_e_logi)
# results_e_logi = model_e_logi.fit(method = 'bfgs')


# # Gas
# x_g_logi = X_G.values
# y_g_logi = Y_G_logi.values
# x_gTest_logi = X_G_test.values
# y_gTest_logi = Y_G_test_logi.values

# # Multinomial logistic regression 
# x_g_logi = sm.add_constant(x_g_logi)

# model_g_logi = sm.MNLogit(y_g_logi,x_g_logi)
# results_g_logi = model_g_logi.fit(method = 'bfgs')




In [37]:
# printVariables()
# print()
# results_e_logi.summary2()

In [38]:
# printVariables()
# print()
# results_g_logi.summary2()

In [None]:
# # Run Logistic Regression
# # df_BU_analysis_E
# # df_BU_analysis_G


# # Traning Data
# X_E = df_BU_training_E[X_column]
# X_G = df_BU_training_G[X_column]

# Y_E = df_BU_training_E['Y_E_Level']
# Y_G = df_BU_training_G['Y_G_Level']


# # Testing Data
# X_E_test = df_BU_testing_E[X_column]
# X_G_test = df_BU_testing_G[X_column]

# Y_E_test = df_BU_testing_E['Y_E_Level']
# Y_G_test = df_BU_testing_G['Y_G_Level']


# logr_model_E = LogisticRegression()
# logr_model_G = LogisticRegression()

# logr_model_E.fit(X_E, Y_E)
# logr_model_G.fit(X_G, Y_G)

# score_E = cross_val_score(logr_model_E, X_E, Y_E, cv=3)
# score_G = cross_val_score(logr_model_G, X_G, Y_G, cv=3)

# score_logr_mean_E = score_E.mean()
# score_logr_mean_G = score_G.mean()

# print('Accuracy of logistic model on electricity is',score_logr_mean_E)
# print('Accuracy of logistic model on gas is', score_logr_mean_G)

# print('------------------------------------------------')

# logr_E_coef = logr_model_E.coef_
# logr_G_coef = logr_model_G.coef_
# print(logr_E_coef[0, :])
# # print(logr_G_coef.shape)

## Multi-class SVM

In [None]:
# Run SVM


## Neural Networks

In [None]:
# Run NN
