# Linear Regression


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_validate, RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
import pickle

In [2]:
data=pd.read_csv('mappingDunas.csv', header=0)

print(data.head())

   obj         x         y         w         h      perc        lat      long
0    2  0.410373  0.229552  0.020399  0.028549  0.705900  40.629777 -8.746839
1    2  0.420139  0.195988  0.015625  0.021605  0.722931  40.629997 -8.746807
2    2  0.694444  0.626157  0.098958  0.138117  0.818434  40.629226 -8.746824
3    2  0.446398  0.282407  0.029080  0.043210  0.855846  40.629575 -8.746841
4    2  0.421224  0.194830  0.015191  0.022377  0.673168  40.630009 -8.746807


In [3]:
data.describe()
data=data.values

In [24]:
print(data.shape[1])

X = data[:, 1:3]
#X = data[:, 1:5]
y = data[:, 6:8]

print(X)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.15)

print(X_train.shape)


8
[[0.410373 0.229552]
 [0.420139 0.195988]
 [0.694444 0.626157]
 [0.446398 0.282407]
 [0.421224 0.19483 ]
 [0.409071 0.225694]
 [0.658854 0.576775]
 [0.442274 0.277006]
 [0.421441 0.192515]
 [0.407769 0.221836]
 [0.908637 0.955633]
 [0.633464 0.534722]
 [0.438585 0.270062]
 [0.421658 0.19213 ]
 [0.406467 0.220293]
 [0.61263  0.497685]
 [0.434462 0.262346]
 [0.867405 0.910108]
 [0.421658 0.19213 ]
 [0.406467 0.220293]
 [0.61263  0.497685]
 [0.434462 0.262346]
 [0.867405 0.910108]
 [0.423611 0.189429]
 [0.591146 0.466049]
 [0.405816 0.218364]
 [0.43099  0.257716]
 [0.818793 0.856481]
 [0.571615 0.442901]
 [0.40408  0.215278]
 [0.427951 0.253472]
 [0.771267 0.768519]
 [0.402778 0.21142 ]
 [0.552734 0.413966]
 [0.424913 0.247299]
 [0.726562 0.697531]
 [0.434679 0.349923]
 [0.525174 0.575231]
 [0.442708 0.368441]
 [0.559896 0.65625 ]
 [0.450955 0.388889]
 [0.603082 0.757716]
 [0.459852 0.411265]
 [0.656033 0.86304 ]
 [0.975477 0.829861]
 [0.709852 0.935957]
 [0.472656 0.441358]
 [0.947266 

In [17]:
def featureNormalization(X):
    """
    Take in numpy array of X values and return normalize X values,
    the mean and standard deviation of each feature
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis = 0)
    
    X_norm = (X - mean)/std
    
    return X_norm , mean , std

In [18]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

In [27]:
def model_evalutaion(model, X, y, n_splits):
    
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=10, random_state=1)
    
    scores = cross_validate(model, X_train, y_train, cv=cv, return_train_score=True)

    print(scores['test_score'].mean())

    print(scores['train_score'].mean())

In [28]:
model = LinearRegression()

model_evalutaion(model, X_train, y_train, 4)


0.7289610599656776
0.7830842650989314


In [29]:
model2 = Ridge()

grid = {'alpha': [0.001, 0.01 ,0.02, 0.05, 0.1, 0.2, 0.5, 0.7, 1, 3 , 10, 30, 40, 50, 100]}

clf = GridSearchCV(model2,grid, cv=4)

clf.fit(X_train, y_train)

print(clf.best_params_)


{'alpha': 0.01}


In [22]:
def test_model(model, X_test, y_test):
    
    predictions = model.predict(X_test)
    
    for i in range(len(predictions)):
        print("X=%s, Predicted=%s, equal=%s" % (predictions[i], y_test[i], (predictions[i]==y_test[i]) ))

In [32]:
model3 = Ridge(alpha=0.01)


model_evalutaion(clf, X_train, y_train, 4)
model3.fit(X_train, y_train)

test_model(clf, X_test, y_test )

0.7288688223529517
0.782652606814152
X=[40.62932332 -8.74688223], Predicted=[40.62924  -8.746874], equal=[False False]
X=[40.62957703 -8.74685719], Predicted=[40.629396 -8.746883], equal=[False False]
X=[40.6297119  -8.74682278], Predicted=[40.629997 -8.746807], equal=[False False]
X=[40.62969405 -8.7468348 ], Predicted=[40.629789 -8.746839], equal=[False False]
X=[40.62966262 -8.74683397], Predicted=[40.629632 -8.746831], equal=[False False]
X=[40.62897774 -8.74681589], Predicted=[40.629176 -8.74683 ], equal=[False False]
X=[40.62969075 -8.74683524], Predicted=[40.629777 -8.746839], equal=[False False]
X=[40.62948182 -8.74686493], Predicted=[40.629344 -8.746883], equal=[False False]
X=[40.62908296 -8.74682741], Predicted=[40.62922  -8.746827], equal=[False False]
X=[40.6297035 -8.7468342], Predicted=[40.629866 -8.746832], equal=[False False]
X=[40.62904249 -8.74676699], Predicted=[40.629206 -8.746771], equal=[False False]
X=[40.62912145 -8.74676345], Predicted=[40.629214 -8.746771], e

In [15]:
model4 = MultiOutputRegressor(SVR())

model_evalutaion(model4, X_train, y_train, 4)

-1.0012908954452915
-0.7376644971004629


In [71]:
def save_model(model, filename):
    
    # para obter os coeficicientes aprendidos durante o treino model.coef__
    
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    

In [72]:
def load_model(filename):
    with open(filename, 'rb') as file:
       return pickle.load(file)

In [25]:
filename = "ridge.pkl"

save_model(clf, filename)

pkl_model = load_model(filename)

model_evalutaion(clf, X_train, y_train, 5)

#test_model(pkl_model, X_test, y_test )

NameError: name 'save_model' is not defined