# Regressão Linear Múltipla

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
    
from functools import wraps

## Carregando Base de Dados

In [2]:
# Download the course repository; the CSV files read below live under MLUD/Aula05/.
!git clone https://github.com/Crissky/MLUD.git

Cloning into 'MLUD'...
remote: Enumerating objects: 90, done.
remote: Counting objects: 100% (90/90), done.
remote: Compressing objects: 100% (81/81), done.
remote: Total 90 (delta 32), reused 29 (delta 4), pack-reused 0
Unpacking objects: 100% (90/90), done.


## Funções de Pré-Processamento

In [3]:
def loadDataset(filename):
    """Read a ';'-delimited CSV and split it into features and target.

    Parameters
    ----------
    filename : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` holds every column except the last one and
        ``y`` holds the last column.
    """
    table = pd.read_csv(filename, delimiter=';')
    features = table.iloc[:, :-1]
    target = table.iloc[:, -1]
    return features.values, target.values

def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN values with the column median in a range of columns.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix. It is modified in place.
    inicioColuna : int
        Index of the first column to impute.
    fimColuna : int
        Index of the last column to impute (inclusive).

    Returns
    -------
    numpy.ndarray
        The same ``X`` object, with the selected columns imputed.
    """
    # fimColuna is inclusive, hence the +1 on the slice end.
    columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = median_imputer.fit_transform(X[:, columns])
    return X

def computeCategorization(X, i):
    """One-hot encode column ``i`` of ``X``, dropping one dummy column.

    The labels in column ``i`` are first replaced by integer codes (this
    write happens in the caller's array), then expanded into one indicator
    column per category. The indicator of the first category is dropped to
    avoid the dummy variable trap. The remaining indicators take the place
    of column ``i``; every other column of ``X`` is kept.

    Earlier versions only supported the first or the last column: the
    first-column path passed the indicator matrix to ``np.insert`` with a
    scalar index, which numpy interprets as transposed (it failed unless the
    number of categories equalled the number of rows), and the other path
    discarded every column to the right of ``i``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix. Column ``i`` is overwritten with integer codes.
    i : int
        Index of the categorical column. Negative values count from the end.

    Returns
    -------
    numpy.ndarray
        New array with the same dtype as ``X``: the columns before ``i``,
        the kept indicator columns, then the columns after ``i``.
    """
    if i < 0:
        # Normalise so the slices below behave like ordinary indexing.
        i += X.shape[1]

    X[:, i] = LabelEncoder().fit_transform(X[:, i])

    # Cast to X's dtype so the result keeps the dtype np.insert used to give.
    dummies = pd.get_dummies(X[:, i]).values.astype(X.dtype)

    # Drop the first category's indicator (dummy variable trap).
    if i == 0:
        kept = dummies[:, 1:]
    else:
        # Descending category order: identical to what this function has
        # always returned when encoding the last column.
        kept = dummies[:, :0:-1]

    return np.concatenate((X[:, :i], kept, X[:, i + 1:]), axis=1)

def splitTrainTestSets(X, y, testSize, randomState=None):
    """Randomly split features and target into training and test sets.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values, aligned with the rows of ``X``.
    testSize : float or int
        Forwarded to ``train_test_split`` as ``test_size`` (a float is the
        fraction of rows held out for testing).
    randomState : int or None, optional
        Forwarded as ``random_state``. Pass an integer to get the same split
        on every run; the default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(XTrain, XTest, yTrain, yTest)``.
    """
    XTrain, XTest, yTrain, yTest = train_test_split(
        X, y, test_size=testSize, random_state=randomState)
    return XTrain, XTest, yTrain, yTest

def computeScaling(train, test):
    """Standardise ``train`` and ``test`` using statistics from ``train``.

    The scaler is fitted on the training data only and then applied to both
    sets. Fitting it again on the test data (as this function used to do)
    would scale the two sets with different means and variances and leak
    test-set statistics into the evaluation.

    Parameters
    ----------
    train : array-like
        Training features; used to fit the scaler.
    test : array-like
        Test features; only transformed.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling.
    """
    scaleX = StandardScaler()
    train = scaleX.fit_transform(train)
    # Reuse the mean/std learned from the training set.
    test = scaleX.transform(test)
    return train, test

## Regressão Linear

In [4]:
def computeLinearRegressionModel(XTrain, yTrain, XTest, yTest):
    """Fit a linear regression on the training data.

    The fitted model is not returned, and ``XTest`` / ``yTest`` are only
    needed by the optional plot that is disabled below.
    """
    model = LinearRegression()
    model.fit(XTrain, yTrain)

    # Optional plot of the test set against the fitted line (disabled):
    #
    # plt.scatter(XTest[:,-1], yTest, color="red")
    # plt.plot(XTest[:,-1], model.predict(XTest), color="blue")
    # plt.title("Inscritos x Visualizações (SVBR)")
    # plt.xlabel("Total de Inscritos")
    # plt.ylabel("Total de Visualizações")
    # plt.show()

def runLinearRegressionExample(filename):
    """Run the simple regression pipeline on ``filename``, timing each stage.

    Stages: load the CSV, impute missing values from column 1 onwards,
    one-hot encode column 0, split with a test size of 0.8, and fit the
    linear regression. The wall-clock time of each stage is printed.
    """
    def timed(report, step, *args):
        # Run one stage and print how long it took using the given template.
        began = time.time()
        outcome = step(*args)
        print(report % (time.time() - began), "segundos.")
        return outcome

    X, y = timed("Load Dataset: %.2f", loadDataset, filename)
    # The end index is past the last column; the slice simply stops there.
    X = timed("Fill Missing Data: %.2f", fillMissingData, X, 1, X.shape[1])
    X = timed("Compute Categorization: %.2f", computeCategorization, X, 0)
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets: %.2f", splitTrainTestSets, X, y, 0.8)
    timed("Compute Linear Regression: %.2f",
          computeLinearRegressionModel, XTrain, yTrain, XTest, yTest)

In [5]:
# Time each stage of the simple regression pipeline on the SVBR dataset.
runLinearRegressionExample('MLUD/Aula05/svbr.csv')

Load Dataset: 0.02 segundos.
Fill Missing Data: 0.00 segundos.
Compute Categorization: 0.00 segundos.
Split Train Test sets: 0.00 segundos.
Compute Linear Regression: 0.00 segundos.


## Regressão Linear Múltipla

In [6]:
def computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest):
    """Fit a linear regression and print predictions next to expected values.

    For every test row, one line is printed with the predicted value, the
    expected value from ``yTest`` and the absolute difference between them.
    Nothing is returned.
    """
    model = LinearRegression()
    model.fit(XTrain, yTrain)
    predictions = model.predict(XTest)

    print('\n#########################INÍCIO#########################')
    print('\nIniciando comparação:\nPREDITO | ESPERADO | DIFERENÇA\n')
    for row, predicted in enumerate(predictions):
        expected = yTest[row]
        print(predicted, expected, abs(predicted - expected))
    print('\n##########################FIM##########################\n')

def runMultipleLinearRegressionExemple(filename):
    """Run the multiple regression pipeline on ``filename``, timing each stage.

    Stages: load the CSV, impute missing values in columns 1 and 2, one-hot
    encode column 3, split with a test size of 0.8, then fit the model and
    print its predictions. The wall-clock time of each stage is printed.
    """
    def timed(report, step, *args):
        # Run one stage and print how long it took using the given template.
        began = time.time()
        outcome = step(*args)
        print(report % (time.time() - began), "segundos.")
        return outcome

    X, y = timed("Load Dataset: %.2f", loadDataset, filename)
    X = timed("Fill Missing Data: %.2f", fillMissingData, X, 1, 2)
    X = timed("Compute Categorization: %.2f", computeCategorization, X, 3)
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets: %.2f", splitTrainTestSets, X, y, 0.8)
    timed("Compute Multiple Linear Regression: %.2f",
          computeMultipleLinearRegressionModel, XTrain, yTrain, XTest, yTest)

In [7]:
# Time each stage of the multiple regression pipeline on the insurance dataset.
runMultipleLinearRegressionExemple('MLUD/Aula05/insurance.csv')

Load Dataset: 0.01 segundos.
Fill Missing Data: 0.00 segundos.
Compute Categorization: 0.00 segundos.
Split Train Test sets: 0.00 segundos.

#########################INÍCIO#########################

Iniciando comparação:
PREDITO | ESPERADO | DIFERENÇA

15534.023314541555 24513.09126 8979.067945458446
9052.330941579037 2534.39375 6517.937191579037
9706.32230633144 23288.9284 13582.60609366856
18210.108888614468 12648.7034 5561.405488614468
11495.18219024568 3393.35635 8101.8258402456795
10136.684595293216 17904.52705 7767.842454706784
13818.187669440646 7441.053000000001 6377.134669440645
8910.315236281644 1725.5523 7184.762936281643
7382.4020678702145 3378.91 4003.4920678702147
13385.389968114549 7742.1098 5643.280168114548
5258.320403061765 1607.5101 3650.8103030617654
16602.527259030954 13217.0945 3385.432759030955
9394.876209060934 3309.7926 6085.083609060934
11964.927615898701 39611.7577 27646.8300841013
16999.09346189999 10141.1362 6857.957261899988
9010.75976584257 3484.330999999

## Visualizando a Base de Dados

In [8]:
# Preview the first rows of the SVBR dataset (';'-delimited CSV).
dataset1 = pd.read_csv('MLUD/Aula05/svbr.csv', delimiter=';')
dataset1.head()

Unnamed: 0,Canal,Inscritos,Visualizações
0,Site Arqueologia Egípcia,13438.0,406590
1,Terra Negra,35241.0,868235
2,Frank Jaava,31680.0,2856508
3,Dispersciência,25100.0,150000
4,Olá Ciência,32788.0,1575456


In [9]:
# Preview the first rows of the insurance dataset (';'-delimited CSV).
dataset2 = pd.read_csv('MLUD/Aula05/insurance.csv', delimiter=';')
dataset2.head()

Unnamed: 0,age,bmi,children,region,charges
0,19,27.9,0,southwest,16884.924
1,18,33.77,1,southeast,1725.5523
2,28,33.0,3,southeast,4449.462
3,33,22.705,0,northwest,21984.47061
4,32,28.88,0,northwest,3866.8552
