<a href="https://colab.research.google.com/github/1exip/Learning-PyTorch/blob/master/BostonLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import io
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [0]:
# Get pandas dataframe from raw csv link
URL = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
csv = requests.get(URL).content
housing_df = pd.read_csv(io.StringIO(csv.decode('utf-8')))

In [3]:
#Get a look at the data
housing_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
#See if there are any null values
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [0]:
housing_df['chas'] = housing_df['chas'].astype(float)
housing_df['rad'] = housing_df['rad'].astype(float)
housing_df['tax'] = housing_df['tax'].astype(float)

In [6]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    float64
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    float64
 9   tax      506 non-null    float64
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [7]:
#Create a correlation matrix in order to find what attribute to use for stratified split
correlation = housing_df.corr()
correlation['medv'].sort_values(ascending=False)

medv       1.000000
rm         0.695360
zn         0.360445
b          0.333461
dis        0.249929
chas       0.175260
age       -0.376955
rad       -0.381626
crim      -0.388305
nox       -0.427321
tax       -0.468536
indus     -0.483725
ptratio   -0.507787
lstat     -0.737663
Name: medv, dtype: float64

In [8]:
print(np.max(housing_df['lstat']))
print(np.min(housing_df['lstat']))

37.97
1.73


In [0]:
housing_df['lstat_cat'] = pd.cut(housing_df['lstat'],
                           bins=[1.,5.,10.,15.,20.,25.,30.,35.,np.inf],
                           labels=[1,2,3,4,5,6,7,8])

In [0]:
stratsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in stratsplit.split(housing_df, housing_df['lstat_cat']):
  strat_train = housing_df.loc[train_index]
  strat_test = housing_df.loc[test_index]

In [12]:
print('Stratified Test Set')
print(strat_test['lstat_cat'].value_counts(ascending=False) / len(strat_test))
print('\n')
print('Stratified Train Set')
print(strat_train['lstat_cat'].value_counts(ascending=False) / len(strat_train))

Stratified Test Set
2    0.313725
3    0.245098
4    0.176471
1    0.127451
5    0.078431
6    0.039216
7    0.019608
8    0.000000
Name: lstat_cat, dtype: float64


Stratified Train Set
2    0.309406
3    0.247525
4    0.173267
1    0.121287
5    0.079208
6    0.044554
7    0.019802
8    0.004950
Name: lstat_cat, dtype: float64


In [0]:
for set in (strat_train, strat_test):
  set.drop('lstat_cat',axis=1,inplace=True)

In [14]:
strat_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404 entries, 123 to 273
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     404 non-null    float64
 1   zn       404 non-null    float64
 2   indus    404 non-null    float64
 3   chas     404 non-null    float64
 4   nox      404 non-null    float64
 5   rm       404 non-null    float64
 6   age      404 non-null    float64
 7   dis      404 non-null    float64
 8   rad      404 non-null    float64
 9   tax      404 non-null    float64
 10  ptratio  404 non-null    float64
 11  b        404 non-null    float64
 12  lstat    404 non-null    float64
 13  medv     404 non-null    float64
dtypes: float64(14)
memory usage: 47.3 KB


In [0]:
housing = strat_train.drop('medv',axis=1)
housing_labels = strat_train['medv'].copy()
housing_test = strat_test.drop('medv',axis=1)
housing_test_labels = strat_test['medv'].copy()

In [0]:
scaler = MinMaxScaler()
housing[['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','b','lstat']] = scaler.fit_transform(housing[['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','b','lstat']])
housing_test[['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','b','lstat']] = scaler.fit_transform(housing_test[['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','b','lstat']])

In [45]:
housing

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
123,0.001959,0.000000,0.921221,0.0,0.403292,0.439739,0.969104,0.073470,0.043478,0.001908,0.691489,0.932952,0.653422
323,0.003775,0.000000,0.232567,0.0,0.222222,0.411381,0.735324,0.326139,0.173913,0.190840,0.744681,0.985451,0.276214
166,0.027253,0.000000,0.692424,0.0,0.452675,0.836942,0.960865,0.082706,0.173913,0.412214,0.223404,0.930405,0.054360
214,0.003852,0.000000,0.353562,0.0,0.213992,0.354666,0.071061,0.222986,0.130435,0.171756,0.638298,0.879041,0.767660
94,0.000498,0.294737,0.521297,0.0,0.162551,0.515041,0.766220,0.225488,0.130435,0.158397,0.595745,1.000000,0.244481
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.001945,0.263158,0.147757,0.0,0.139918,0.417705,0.651905,0.554020,0.304348,0.185115,0.755319,0.995486,0.315121
465,0.042940,0.000000,0.636638,0.0,0.555556,0.421153,0.466529,0.175577,1.000000,0.914122,0.808511,0.842403,0.342163
145,0.032274,0.000000,0.692424,0.0,1.000000,0.492240,1.000000,0.025670,0.173913,0.412214,0.223404,0.435196,0.719371
12,0.001189,0.131579,0.251037,0.0,0.286008,0.446062,0.371782,0.392547,0.173913,0.236641,0.276596,0.983862,0.385762


In [0]:
housing = np.array(housing, dtype='float32')
housing_labels = np.array(housing_labels, dtype='float32')

In [0]:
housing = torch.from_numpy(housing)
housing_labels = torch.from_numpy(housing_labels)

In [0]:
train_ds = TensorDataset(housing, housing_labels)

In [0]:
train_dl = DataLoader(train_ds, batch_size=5, shuffle=True)

In [85]:
for xb, yb in train_dl:
  print(xb)
  print(yb)
  break

tensor([[4.5358e-03, 2.3158e-01, 1.7527e-01, 0.0000e+00, 9.4650e-02, 4.8802e-01,
         3.2956e-01, 6.2956e-01, 2.6087e-01, 2.7290e-01, 6.9149e-01, 9.8306e-01,
         2.0502e-01],
        [9.9908e-04, 4.2105e-01, 1.9600e-01, 0.0000e+00, 1.2757e-01, 5.5969e-01,
         3.0072e-01, 2.7329e-01, 1.3043e-01, 1.2786e-01, 5.3191e-01, 1.0000e+00,
         1.5066e-01],
        [1.9816e-02, 0.0000e+00, 6.9242e-01, 0.0000e+00, 4.5267e-01, 7.5263e-01,
         9.0525e-01, 7.5882e-02, 1.7391e-01, 4.1221e-01, 2.2340e-01, 9.4334e-01,
         0.0000e+00],
        [4.5530e-03, 0.0000e+00, 2.3257e-01, 0.0000e+00, 2.2222e-01, 5.4685e-01,
         3.8311e-01, 3.2614e-01, 1.7391e-01, 1.9084e-01, 7.4468e-01, 1.0000e+00,
         1.2114e-01],
        [7.2174e-03, 0.0000e+00, 1.8809e-01, 0.0000e+00, 2.4486e-01, 4.6369e-01,
         6.7147e-01, 2.3063e-01, 3.0435e-01, 2.2901e-01, 5.1064e-01, 9.5323e-01,
         2.7373e-01]])
tensor([24.3000, 29.1000, 50.0000, 25.0000, 24.3000])


In [86]:
model = nn.Linear(13,1)
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[-0.2756, -0.2611,  0.0373,  0.2061,  0.0084,  0.1929,  0.1455, -0.1950,
         -0.1927, -0.1262,  0.0174, -0.1820,  0.0172]], requires_grad=True)
Parameter containing:
tensor([-0.0702], requires_grad=True)


In [87]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.2756, -0.2611,  0.0373,  0.2061,  0.0084,  0.1929,  0.1455, -0.1950,
          -0.1927, -0.1262,  0.0174, -0.1820,  0.0172]], requires_grad=True),
 Parameter containing:
 tensor([-0.0702], requires_grad=True)]

In [88]:
preds = model(housing)
preds

tensor([[ 0.0235],
        [-0.1572],
        [-0.0130],
        [-0.2035],
        [-0.1721],
        [ 0.0878],
        [-0.2850],
        [-0.4639],
        [-0.3561],
        [-0.0851],
        [-0.2792],
        [-0.1523],
        [-0.2009],
        [-0.3845],
        [-0.3022],
        [ 0.0277],
        [-0.4386],
        [-0.2749],
        [-0.4941],
        [-0.1330],
        [-0.2422],
        [-0.1622],
        [-0.2596],
        [-0.1284],
        [-0.0942],
        [-0.4111],
        [-0.1829],
        [-0.2075],
        [-0.2033],
        [-0.3388],
        [-0.0637],
        [-0.0192],
        [-0.2188],
        [-0.1375],
        [-0.1119],
        [-0.2051],
        [-0.3252],
        [-0.3093],
        [ 0.0027],
        [-0.5405],
        [-0.3038],
        [-0.0570],
        [-0.1759],
        [-0.2965],
        [-0.1374],
        [-0.2642],
        [-0.3408],
        [-0.3641],
        [-0.3489],
        [-0.0861],
        [-0.2273],
        [-0.3016],
        [-0.

In [0]:
loss_fn = F.mse_loss

In [90]:
loss = loss_fn(model(housing),housing_labels)

  """Entry point for launching an IPython kernel.


In [0]:
opt = torch.optim.SGD(model.parameters(),lr=0.0001)

In [0]:
def fit(num_epochs, model, loss_fn, opt, train_dl):
  for epoch in range(num_epochs):
    for xb,yb in train_dl:
      pred = model(xb)
      loss = loss_fn(pred,yb)
      loss.backward()
      opt.step()
      opt.zero_grad()
    if epoch%10 == 0:
      print('Epoch:',epoch,"Loss:",loss)

In [95]:
fit(100, model, loss_fn, opt, train_dl)

  """
  """


Epoch: 0 Loss: tensor(109.9302, grad_fn=<MseLossBackward>)
Epoch: 10 Loss: tensor(5.4137, grad_fn=<MseLossBackward>)
Epoch: 20 Loss: tensor(35.6738, grad_fn=<MseLossBackward>)
Epoch: 30 Loss: tensor(60.8391, grad_fn=<MseLossBackward>)
Epoch: 40 Loss: tensor(12.5574, grad_fn=<MseLossBackward>)
Epoch: 50 Loss: tensor(30.8676, grad_fn=<MseLossBackward>)
Epoch: 60 Loss: tensor(157.5648, grad_fn=<MseLossBackward>)
Epoch: 70 Loss: tensor(63.2430, grad_fn=<MseLossBackward>)
Epoch: 80 Loss: tensor(89.3865, grad_fn=<MseLossBackward>)
Epoch: 90 Loss: tensor(9.9349, grad_fn=<MseLossBackward>)
