# Linear Regression


In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_validate, RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
import pickle

In [43]:
data=pd.read_csv('mappingRia2.csv', header=0)

print(data.head())

   obj        x1        y1        x2        y2      prob        lat      long
0    2  0.549913  0.485339  0.023438  0.030864  0.704711  40.628118 -8.733072
1    2  0.129774  0.636960  0.142361  0.135031  0.819667  40.628362 -8.733523
2    2  0.047309  0.687114  0.093750  0.176698  0.610786  40.628366 -8.733555
3    2  0.555339  0.480324  0.023003  0.031636  0.630246  40.628103 -8.733021
4    2  0.559245  0.476852  0.020399  0.027778  0.536870  40.628083 -8.732968


In [44]:
data.describe()
data=data.values

In [45]:
print(data.shape[1])

X = data[:, 1:3]
#X = data[:, 1:5]
y = data[:, 6:8]

print(X)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.15)

print(X_train.shape)


8
[[0.549913  0.485339 ]
 [0.129774  0.63696  ]
 [0.047309  0.687114 ]
 [0.555339  0.480324 ]
 [0.559245  0.476852 ]
 [0.564019  0.471451 ]
 [0.567057  0.467593 ]
 [0.576389  0.46142  ]
 [0.577908  0.461034 ]
 [0.485894  0.464506 ]
 [0.475911  0.468364 ]
 [0.465712  0.471451 ]
 [0.440321  0.48071  ]
 [0.422743  0.486497 ]
 [0.406467  0.490741 ]
 [0.381293  0.499614 ]
 [0.367622  0.509645 ]
 [0.269748  0.541667 ]
 [0.219618  0.563657 ]
 [0.147786  0.587577 ]
 [0.0659722 0.625386 ]
 [0.155816  0.872685 ]
 [0.208116  0.83179  ]
 [0.317925  0.772762 ]
 [0.45204   0.638889 ]
 [0.480252  0.614583 ]
 [0.503255  0.594522 ]
 [0.52322   0.577932 ]
 [0.53928   0.565972 ]
 [0.554688  0.555556 ]
 [0.569661  0.54591  ]
 [0.298611  0.537809 ]
 [0.279731  0.540895 ]
 [0.250217  0.553241 ]
 [0.202474  0.576775 ]
 [0.147352  0.599151 ]
 [0.0831163 0.624614 ]
 [0.412109  0.490741 ]
 [0.395399  0.496914 ]
 [0.377821  0.501929 ]
 [0.366102  0.509259 ]
 [0.282118  0.533951 ]
 [0.258464  0.546296 ]
 [0.21332

In [46]:
def featureNormalization(X):
    """
    Take in numpy array of X values and return normalize X values,
    the mean and standard deviation of each feature
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis = 0)
    
    X_norm = (X - mean)/std
    
    return X_norm , mean , std

In [47]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

In [48]:
def model_evalutaion(model, X, y, n_splits):
    
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=10, random_state=1)
    
    scores = cross_validate(model, X_train, y_train, cv=cv, return_train_score=True)

    print(scores['test_score'].mean())

    print(scores['train_score'].mean())

In [49]:
model = LinearRegression()

model_evalutaion(model, X_train, y_train, 4)


0.7592292654512396
0.7886055844131834


In [50]:
model2 = Ridge()

grid = {'alpha': [0.001, 0.01 ,0.02, 0.05, 0.1, 0.2, 0.5, 0.7, 1, 3 , 10, 30, 40, 50, 100]}

clf = GridSearchCV(model2,grid, cv=5)

clf.fit(X_train, y_train)

print(clf.best_params_)


{'alpha': 0.02}


In [51]:
def test_model(model, X_test, y_test):
    
    print("test ", X_test)
    print(type(X_test))
    
    predictions = model.predict(X_test)
    
    for i in range(len(predictions)):
        print("X=%s, Predicted=%s, equal=%s" % (predictions[i], y_test[i], (predictions[i]==y_test[i]) ))

In [52]:
model3 = Ridge(alpha=0.02)


model_evalutaion(clf, X_train, y_train, 5)
model3.fit(X_train, y_train)

test_model(clf, X_test, y_test )

test_model(clf, np.asarray([[0.61263,  0.497685]]), np.asarray([[40.6297278, -8.74681932]]))

0.757879304072973
0.7876738202235863
test  [[0.428168  0.607253 ]
 [0.258464  0.546296 ]
 [0.456597  0.567515 ]
 [0.129774  0.63696  ]
 [0.429253  0.596451 ]
 [0.503255  0.594522 ]
 [0.499132  0.546296 ]
 [0.532552  0.510802 ]
 [0.520399  0.478009 ]
 [0.0594618 0.805941 ]
 [0.440321  0.48071  ]
 [0.390408  0.565586 ]
 [0.467882  0.511188 ]
 [0.230035  0.765818 ]]
<class 'numpy.ndarray'>
X=[40.62818519 -8.73325312], Predicted=[40.628235 -8.733401], equal=[False False]
X=[40.62828639 -8.73330104], Predicted=[40.628343 -8.733427], equal=[False False]
X=[40.62817466 -8.73320348], Predicted=[40.628226 -8.733373], equal=[False False]
X=[40.62834551 -8.73345142], Predicted=[40.628362 -8.733523], equal=[False False]
X=[40.62818598 -8.73324349], Predicted=[40.62824  -8.733408], equal=[False False]
X=[40.62814552 -8.73319884], Predicted=[40.628191 -8.733342], equal=[False False]
X=[40.62815399 -8.73316106], Predicted=[40.628185 -8.733257], equal=[False False]
X=[40.62814018 -8.73311205], Predict

In [36]:
model4 = MultiOutputRegressor(SVR())

model_evalutaion(model4, X_train, y_train, 4)

-1.1636341683330929
-0.6858364517150991


In [37]:
def save_model(model, filename):
    
    # para obter os coeficicientes aprendidos durante o treino model.coef__
    
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    

In [38]:
def load_model(filename):
    with open(filename, 'rb') as file:
       return pickle.load(file)

In [53]:
filename = "ponte.pkl"

save_model(clf, filename)

pkl_model = load_model(filename)

model_evalutaion(clf, X_train, y_train, 5)

test_model(clf, np.asarray([[0.61263,  0.497685]]), np.asarray([[40.6297278, -8.74681932]]))

0.757879304072973
0.7876738202235863
test  [[0.61263  0.497685]]
<class 'numpy.ndarray'>
X=[40.62809781 -8.73305455], Predicted=[40.6297278  -8.74681932], equal=[False False]
