In [10]:
"""
A utility module for quickly implementing common regression algorithms including:

--> Linear Regression (simple, multiple)

--> Polynomial Regression (simple, multiple)

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lr
import statsmodels.api as sm
import statsmodels.stats
from sklearn import metrics

# Helper: load one worksheet of an Excel workbook into a pandas DataFrame, given the file path and the sheet name.


def excelSheetReader(workingFile, sheet):
    """
    Creates a pandas DataFrame from an Excel workbook with file name: workingFile, and worksheet named: sheet.

    Missing values are replaced by the mean of their column. Only numeric
    columns have a mean, so missing values in non-numeric columns are left
    untouched.

    workingFile -> path (or other object accepted by pandas.ExcelFile) of the workbook
    sheet -> name of the worksheet to load

    Returns: pandas.DataFrame
    """

    # Open the workbook inside a context manager so the underlying file
    # handle is released as soon as the sheet has been parsed.
    with pd.ExcelFile(workingFile) as spreadsheet:
        workingFrame = spreadsheet.parse(sheet)

    # numeric_only=True keeps this working when the sheet contains text
    # columns: recent pandas versions raise a TypeError when asked for the
    # mean of a non-numeric column instead of silently skipping it.
    columnMeans = workingFrame.mean(numeric_only=True)

    return workingFrame.fillna(columnMeans)

class Regression:

    def __init__(self, feature, target):
        """
        Base class that stores the data used by the quick regression helpers.
        feature -> independent variable(s) (m-dimensional pandas DataFrame)
        target -> dependent variable (one-dimensional pandas DataFrame)
        """
        # Keep both inputs as given; nothing is copied or validated here.
        self.feature, self.target = feature, target
    
    
class LinearRegression(Regression):
    """
    Quick linear regression on pandas data.

    Two fits of the same data are kept after fitModel():
    self.model  -> scikit-learn LinearRegression (intercept, coefficients, R2)
    self.model1 -> statsmodels OLS results (summary tables used by getMetrics)
    """

    def __init__(self, feature, target):
        """
        A class for quick linear regression procedures.
        target -> dependent variable (one-dimensional pandas DataFrame)
        feature -> independent variable(s) (m-dimensional pandas DataFrame)
        """
        super().__init__(feature, target)
        # Both stay None until fitModel() has been called.
        self.model = None
        self.model1 = None

    def fitModel(self):
        """Fit the scikit-learn and statsmodels models on this instance's data."""
        self.model = lr.LinearRegression().fit(self.feature, self.target)

        # Use the instance's own data (not module-level variables) so the
        # statsmodels fit always matches the scikit-learn fit above.
        # add_constant adds the intercept column that OLS does not add itself.
        featureWithConst = sm.add_constant(self.feature)
        self.model1 = sm.OLS(self.target, featureWithConst).fit()

    def getFeatureNames(self):
        """Return the names of the independent variables (features) as a list."""
        return list(self.feature.columns)

    def getR2(self):
        """Return the coefficient of determination (r-squared) of the fitted model on its own data."""
        return self.model.score(self.feature, self.target)

    def getIntercept(self):
        """Return the fitted scikit-learn model's intercept."""
        return self.model.intercept_

    def getCoefs(self):
        """
        Return the model coefficients as a dictionary.

        The feature names (as strings) are the keys and the corresponding
        coefficient values are the values, in column order.

        Raises: IndexError if the model holds more coefficients than there
        are feature columns (e.g. a model fitted on several targets).
        """
        names = [str(column) for column in self.feature.columns]

        # coef_ is 2-D when the target was a DataFrame and 1-D when it was a
        # Series; flattening handles both layouts.
        coefs = np.ravel(self.model.coef_)
        if len(coefs) > len(names):
            raise IndexError(
                "model has more coefficients than feature columns")

        return dict(zip(names, coefs))

    def getMetrics(self, asTable=True):
        """
        Return the statsmodels regression summary.

        asTable=True  -> tuple of pandas DataFrames parsed from the summary's
                         HTML tables: (metricsFrame1, metricsFrame2, coefFrame)
                         built from summary tables 0, 2 and 1 respectively,
                         followed by the same three frames again.
        asTable=False -> the whole summary as a CSV-formatted string.
        """
        if not asTable:
            return self.model1.summary().as_csv()

        # Local import: only this method needs it. Recent pandas versions
        # deprecate passing literal HTML text to read_html, so each table is
        # wrapped in a file-like object.
        from io import StringIO

        summary = self.model1.summary()

        def tableToFrame(table):
            # First row is the header, first column is the index.
            return pd.read_html(
                StringIO(table.as_html()), header=0, index_col=0)[0]

        metricsFrame1 = tableToFrame(summary.tables[0])
        metricsFrame2 = tableToFrame(summary.tables[2])
        coefFrame = tableToFrame(summary.tables[1])

        # NOTE(review): the three frames are returned twice. This looks
        # accidental, but the six-element shape is kept so existing callers
        # that unpack or index the result keep working -- confirm and trim.
        return metricsFrame1, metricsFrame2, coefFrame, metricsFrame1, metricsFrame2, coefFrame
    
#TODO --> --> --> TESTING AREA -->

# Ad-hoc manual check of the classes above: load the dataset, fit a model and
# print its parameters. Runs at module level every time the file is executed.

# Absolute path of the workbook on the author's machine.
work_file = "C:\\400L Project\\analysis\\LXP_REG_DATASET.xlsx"
frame = excelSheetReader(work_file, 'Sheet1')

# Dependent variable: the 'LXP' column; the double brackets keep it a
# one-column DataFrame rather than a Series.
target = frame[['LXP']]
# Independent variables: label-based slice of all columns from
# 'GOVT_EXP_EDU_GOVEXP' through 'PREV_HIV_TOTAL'.
feature = frame.loc[: , 'GOVT_EXP_EDU_GOVEXP': 'PREV_HIV_TOTAL']

myCustom = LinearRegression(feature, target)
myCustom.fitModel()

#print('feature names: ', myCustom.getFeatureNames(), '\n')
print('intercept: ', myCustom.getIntercept(), '\n')

# One "name value" line per feature coefficient.
print('coefficients: ')
for k, v in myCustom.getCoefs().items():
    print(k, v)

print('\n')
#print('R2: ', myCustom.getR2())
# asTable=False prints the summary as CSV text instead of DataFrames.
print('Metrics: ', myCustom.getMetrics(asTable=False))
#print('Metrics: ', myCustom.getMetrics(asTable=True))
#print('Metrics: ', type(myCustom.getMetrics()))

#myCustom.getMetrics().to_csv()
# Visual separator in the printed output.
print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
#for item in myCustom.getMetrics():
#    print(item)
#    pass





intercept:  [70.58707488] 

coefficients: 
GOVT_EXP_EDU_GOVEXP -0.046543656501934344
ADOL_FRTLY_RATE -0.010941905201402955
PPLE_BASIC_SANI 0.029312396661727216
RURAL_POPULATION -0.002790300627500528
SCHL_ENRL_SECONDARY 0.004061476634235127
POPULATION_LIVING_SLUMS 0.007116386689491075
MORTALITY_RATE_INFANT -0.21624954742528493
LITERACY_RATE -0.0017975800372121414
PPLE_BASIC_DRINKING_WATER 0.026756237481096106
GCF 0.06634085043240186
PPLE_SQ_SQ.  0.0014373600578663041
PREV_HIV_TOTAL -0.23759249779711802


Metrics:                        OLS Regression Results                     
Dep. Variable:   ,LXP             ,  R-squared:         ,   0.893
Model:           ,OLS             ,  Adj. R-squared:    ,   0.866
Method:          ,Least Squares   ,  F-statistic:       ,   33.92
Date:            ,Wed, 23 Jun 2021,  Prob (F-statistic):,1.39e-19
Time:            ,08:50:22        ,  Log-Likelihood:    , -124.14
No. Observations:,    62          ,  AIC:               ,   274.3
Df Residuals:    , 