In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

In [2]:
df = pd.read_csv('../cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,cnt,season_1,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,0,0,0,0,0.24,0.2879,0.81,0.0,16,1,...,0,0,0,0,0,1,1,0,0,0
1,1,0,0,0,0.22,0.2727,0.8,0.0,40,1,...,0,0,0,0,0,1,1,0,0,0
2,2,0,0,0,0.22,0.2727,0.8,0.0,32,1,...,0,0,0,0,0,1,1,0,0,0
3,3,0,0,0,0.24,0.2879,0.75,0.0,13,1,...,0,0,0,0,0,1,1,0,0,0
4,4,0,0,0,0.24,0.2879,0.75,0.0,1,1,...,0,0,0,0,0,1,1,0,0,0


In [4]:
X = df.drop('cnt', axis=1)

In [5]:
y = df['cnt']

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [7]:
reg = LinearRegression(copy_X= True, fit_intercept= True, normalize= False)

In [8]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [9]:
y_pred = reg.predict(X_test)

In [10]:
from sklearn.metrics import mean_absolute_error

print("R^2: {}".format(reg.score(X_test, y_test)))
rmae = mean_absolute_error(y_test, y_pred)
print("Root Mean Absolute Error: {}".format(rmae))

R^2: 0.6805039332293561
Root Mean Absolute Error: 74.4835346796627


In [11]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

10093.149370668183

In [12]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 100.46466727495883


In [13]:
from sklearn.model_selection import cross_val_score

reg = LinearRegression()
cv_results = cross_val_score(reg, X, y, cv=5)

In [14]:
print(cv_results)

[0.22831278 0.66313826 0.52767583 0.64459423 0.63161075]


In [15]:
np.mean(cv_results)

0.5390663686876873

plt.scatter(y_test=df['cnt'], X_train=df['temp'], color='blue', s=50, alpha=.5)
X_plot = sp.linspace(min(dados['temp']), max(df['temp']), len(df['temp']))
plt.plot(X_plot, X_plot*reg.params[1] + reg.params[0], color='r')
plt.ylim(-11,16)
plt.xlim(-2.5,3)
plt.title('Reta de regressão')
plt.ylabel('$y$ - Variável Dependente')
plt.xlabel('$x1$ - Preditor')
plt.show()


In [16]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)
reg = LinearRegression()
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(reg,parameters, cv=5)
grid.fit(X_train, y_train)
print("r2 / variance : ", grid.best_score_)
print("Residual sum of squares: %.2f"
            % np.mean((grid.predict(X_test) - y_test) ** 2))
print(grid.best_params_)

r2 / variance :  0.6844155253070047
Residual sum of squares: 10093.15
{'copy_X': True, 'fit_intercept': True, 'normalize': False}
