In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

data = pd.read_excel("Clean_Data.xlsx")

In [2]:
data.head()

Unnamed: 0,County,totalItems,People,State,Population,medHousInc,Bachelor,MarketShare,FullName,Coastline,Sea
0,Accomack County,6859,358,VA,32412,42879,18.5,0.0034,Virginia,112,True
1,Adams County,18,1,CO,511868,70199,23.6,0.0798,Colorado,0,True
2,Adams County,3792,43,PA,102811,62877,22.4,0.0051,Pennsylvania,0,True
3,Alachua County,131,16,FL,269956,51026,42.5,0.046,Florida,1350,True
4,Alameda County,360845,18586,CA,1666753,101744,46.0,0.0813,California,840,True


In [3]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    
#Transfer category variable to dummy variable
def add_dummy(dataset, columns):
    for i in columns:
        dataset_dummy = pd.get_dummies(dataset[i], prefix=i, drop_first=True)
        dataset = pd.concat([dataset, dataset_dummy], axis=1)
    return dataset
X = data[['Population', 'medHousInc','Bachelor', 'MarketShare','Coastline']]
y = data['totalItems']
"""
X = add_dummy(X,['State'])
X = X.drop('State',axis=1)
"""


#Standardization
#X[['Population', 'medHousInc']] = preprocessing.scale(X[['Population', 'medHousInc']])


#Cross Vaildation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Linear Model
reg = LinearRegression()
reg.fit(X_train,y_train)
y_predict = reg.predict(X_test)
reg.score(X_test, y_test)

est = smf.ols("totalItems~ Population + medHousInc + Bachelor + MarketShare + Coastline",data=data).fit()
print(est.summary())
y_predict = est.predict(data)
print(f"MAE: {metrics.mean_absolute_error(data.totalItems, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(data.totalItems, y_predict)}")






                            OLS Regression Results                            
Dep. Variable:             totalItems   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.268
Method:                 Least Squares   F-statistic:                     57.99
Date:                Fri, 28 Feb 2020   Prob (F-statistic):           2.71e-51
Time:                        11:56:11   Log-Likelihood:                -9672.2
No. Observations:                 781   AIC:                         1.936e+04
Df Residuals:                     775   BIC:                         1.938e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept   -6550.5237   7809.587     -0.839      

In [4]:
est = smf.ols("People ~ Population + medHousInc + Bachelor + MarketShare + Coastline",data=data).fit()
print(est.summary())
y_predict = est.predict(data)
print(f"MAE: {metrics.mean_absolute_error(data.totalItems, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(data.totalItems, y_predict)}")

                            OLS Regression Results                            
Dep. Variable:                 People   R-squared:                       0.387
Model:                            OLS   Adj. R-squared:                  0.383
Method:                 Least Squares   F-statistic:                     97.96
Date:                Fri, 28 Feb 2020   Prob (F-statistic):           5.18e-80
Time:                        11:56:11   Log-Likelihood:                -7112.4
No. Observations:                 781   AIC:                         1.424e+04
Df Residuals:                     775   BIC:                         1.426e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept    -355.1081    294.557     -1.206      

In [5]:
est = smf.ols("People ~ Population + Population*Sea+medHousInc+medHousInc*Sea+Bachelor+Bachelor*Sea+MarketShare+MarketShare*Sea+Coastline+Sea",data=data).fit()
print(est.summary())
y_predict = est.predict(data)
print(f"MAE: {metrics.mean_absolute_error(data.totalItems, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(data.totalItems, y_predict)}")

                            OLS Regression Results                            
Dep. Variable:                 People   R-squared:                       0.387
Model:                            OLS   Adj. R-squared:                  0.383
Method:                 Least Squares   F-statistic:                     97.96
Date:                Fri, 28 Feb 2020   Prob (F-statistic):           5.18e-80
Time:                        11:56:11   Log-Likelihood:                -7112.4
No. Observations:                 781   AIC:                         1.424e+04
Df Residuals:                     775   BIC:                         1.426e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                -

In [6]:
from sklearn import svm
clf = svm.SVR(gamma='scale')
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
print("In train set:")
print(f"R square: {metrics.r2_score(y_train, y_train_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_train, y_train_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_train, y_train_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_train,y_train_predict)}")
print("In test set:")
print(f"R square: {metrics.r2_score(y_test, y_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_test, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_test, y_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_test,y_predict)}")



In train set:
R square: -0.09114826056712766
MAE: 22295.969041130367
RMSE: 5340700539.305106
Explained variance score: 0.00021525629942620306
In test set:
R square: -0.10976609866911935
MAE: 21925.505121432674
RMSE: 4336511140.313254
Explained variance score: 0.00028791378217007946


In [7]:
from sklearn import svm
clf = svm.SVR(kernel='linear', gamma='scale')
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
print("In train set:")
print(f"R square: {metrics.r2_score(y_train, y_train_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_train, y_train_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_train, y_train_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_train,y_train_predict)}")
print("In test set:")
print(f"R square: {metrics.r2_score(y_test, y_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_test, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_test, y_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_test,y_predict)}")

In train set:
R square: 0.13375989082369932
MAE: 23024.313255403376
RMSE: 4239872055.3713155
Explained variance score: 0.15394637711791748
In test set:
R square: 0.4078627008096518
MAE: 20821.735129229313
RMSE: 2313829912.0989356
Explained variance score: 0.4236843785467692


In [8]:
from sklearn import svm
clf = svm.SVR(kernel='poly', degree=3, gamma='scale')
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
print("In train set:")
print(f"R square: {metrics.r2_score(y_train, y_train_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_train, y_train_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_train, y_train_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_train,y_train_predict)}")
print("In test set:")
print(f"R square: {metrics.r2_score(y_test, y_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_test, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_test, y_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_test,y_predict)}")

In train set:
R square: -0.027243109343562555
MAE: 21832.434275058444
RMSE: 5027912361.989333
Explained variance score: 0.0542171310598335
In test set:
R square: -1.922930613639208
MAE: 25219.47951294713
RMSE: 11421615044.476387
Explained variance score: -1.8970929515642814


In [9]:
from sklearn import tree
clf = tree.DecisionTreeRegressor(max_depth = 10)
clf = clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
print("In train set:")
print(f"R square: {metrics.r2_score(y_train, y_train_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_train, y_train_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_train, y_train_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_train,y_train_predict)}")
print("In test set:")
print(f"R square: {metrics.r2_score(y_test, y_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_test, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_test, y_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_test,y_predict)}")

In train set:
R square: 0.9750321978798331
MAE: 4367.096284349207
RMSE: 122206632.28581975
Explained variance score: 0.9750321978798331
In test set:
R square: -0.8654092226432795
MAE: 30787.510381044565
RMSE: 7289254812.285937
Explained variance score: -0.856382299994584


In [10]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(max_depth = 5)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
print("In train set:")
print(f"R square: {metrics.r2_score(y_train, y_train_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_train, y_train_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_train, y_train_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_train,y_train_predict)}")
print("In test set:")
print(f"R square: {metrics.r2_score(y_test, y_predict)}")
print(f"MAE: {metrics.mean_absolute_error(y_test, y_predict)}")
print(f"RMSE: {metrics.mean_squared_error(y_test, y_predict)}")
print(f"Explained variance score: {metrics.explained_variance_score(y_test,y_predict)}")

In train set:
R square: 0.6993007312155701
MAE: 18449.94314559659
RMSE: 1471793343.7670162
Explained variance score: 0.6993269779292814
In test set:
R square: 0.3780078986479156
MAE: 24798.90551460715
RMSE: 2430490244.6874676
Explained variance score: 0.3820092449073301


