<a href="https://colab.research.google.com/github/1exip/Learning-PyTorch/blob/master/BostonLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import io
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [0]:
# Get pandas dataframe from raw csv link
URL = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
csv = requests.get(URL).content
housing_df = pd.read_csv(io.StringIO(csv.decode('utf-8')))

In [3]:
#Get a look at the data
housing_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
#See if there are any null values
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [0]:
#Convert int values into float
housing_df['chas'] = housing_df['chas'].astype(float)
housing_df['rad'] = housing_df['rad'].astype(float)
housing_df['tax'] = housing_df['tax'].astype(float)

In [6]:
#See data info
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    float64
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    float64
 9   tax      506 non-null    float64
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [7]:
#Create a correlation matrix in order to find what attribute to use for stratified split
correlation = housing_df.corr()
correlation['medv'].sort_values(ascending=False)

medv       1.000000
rm         0.695360
zn         0.360445
b          0.333461
dis        0.249929
chas       0.175260
age       -0.376955
rad       -0.381626
crim      -0.388305
nox       -0.427321
tax       -0.468536
indus     -0.483725
ptratio   -0.507787
lstat     -0.737663
Name: medv, dtype: float64

In [8]:
#See max and min values for the most correlated value in order to create categories to stratify the dataset on
print(np.max(housing_df['lstat']))
print(np.min(housing_df['lstat']))

37.97
1.73


In [0]:
#Create a category for stratification
housing_df['lstat_cat'] = pd.cut(housing_df['lstat'],
                           bins=[1.,5.,10.,15.,20.,25.,30.,35.,np.inf],
                           labels=[1,2,3,4,5,6,7,8])

In [0]:
#Initialize the stratified splitting function and split the dataset into stratified train and test data
stratsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in stratsplit.split(housing_df, housing_df['lstat_cat']):
  strat_train = housing_df.loc[train_index]
  strat_test = housing_df.loc[test_index]

In [11]:
#Check how well the function performed by calculating the ratio of the lstat_cat attribute to the value counts in each set
print('Stratified Test Set')
print(strat_test['lstat_cat'].value_counts(ascending=False) / len(strat_test))
print('\n')
print('Stratified Train Set')
print(strat_train['lstat_cat'].value_counts(ascending=False) / len(strat_train))

Stratified Test Set
2    0.313725
3    0.245098
4    0.176471
1    0.127451
5    0.078431
6    0.039216
7    0.019608
8    0.000000
Name: lstat_cat, dtype: float64


Stratified Train Set
2    0.309406
3    0.247525
4    0.173267
1    0.121287
5    0.079208
6    0.044554
7    0.019802
8    0.004950
Name: lstat_cat, dtype: float64


In [0]:
#Remove the lstat_cat attribute from both sets as we don't need it anymore
for set in (strat_train, strat_test):
  set.drop('lstat_cat',axis=1,inplace=True)

In [0]:
#Split the data into input and output values
housing = strat_train.drop('medv',axis=1)
housing_labels = strat_train['medv'].copy()
housing_test = strat_test.drop('medv',axis=1)
housing_test_labels = strat_test['medv'].copy()

In [0]:
#Initialize a min-max normalizing function for better predictions
scaler = MinMaxScaler()
housing = scaler.fit_transform(housing)
housing_test = scaler.fit_transform(housing_test)

In [15]:
housing

array([[0.00195926, 0.        , 0.92122126, ..., 0.69148936, 0.93295174,
        0.65342163],
       [0.00377544, 0.        , 0.23256691, ..., 0.74468085, 0.9854506 ,
        0.27621413],
       [0.02725324, 0.        , 0.69242367, ..., 0.22340426, 0.93040496,
        0.05435982],
       ...,
       [0.03227379, 0.        , 0.69242367, ..., 0.22340426, 0.43519593,
        0.71937086],
       [0.00118948, 0.13157895, 0.25103656, ..., 0.27659574, 0.98386202,
        0.38576159],
       [0.00293168, 0.21052632, 0.21673577, ..., 0.63829787, 0.98454284,
        0.13383002]])

In [0]:
#Converting the dataframes into numpy arrays
housing = np.array(housing, dtype='float32')
housing_labels = np.array(housing_labels, dtype='float32')
housing_test = np.array(housing_test, dtype='float32')
housing_test_labels = np.array(housing_test_labels, dtype='float32')

In [0]:
#Converting the numpy arrays to PyTorch tensors
housing = torch.from_numpy(housing)
housing_labels = torch.from_numpy(housing_labels)
housing_test = torch.from_numpy(housing_test)
housing_test_labels = torch.from_numpy(housing_test_labels)

In [0]:
#Initialise the TensorDataset function
train_ds = TensorDataset(housing, housing_labels)

In [0]:
#Get a training dataset with a batch size of 5
train_dl = DataLoader(train_ds, batch_size=5)

In [20]:
#Initialize the linear regression model
model = nn.Linear(13,1)
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[-0.1368, -0.1292, -0.0351,  0.1844, -0.2166, -0.2030, -0.1623,  0.1111,
         -0.0313,  0.2620,  0.0073,  0.1425, -0.0525]], requires_grad=True)
Parameter containing:
tensor([-0.0236], requires_grad=True)


In [21]:
#Check model parameters
list(model.parameters())

[Parameter containing:
 tensor([[-0.1368, -0.1292, -0.0351,  0.1844, -0.2166, -0.2030, -0.1623,  0.1111,
          -0.0313,  0.2620,  0.0073,  0.1425, -0.0525]], requires_grad=True),
 Parameter containing:
 tensor([-0.0236], requires_grad=True)]

In [0]:
#Create prediction data by running it through the untrained model
preds = model(housing)

In [0]:
#Initialize the loss function
loss_fn = F.mse_loss

In [24]:
#Get shape info to see if we have to reshape anything
print(np.shape(preds))
print(np.shape(housing_labels))

torch.Size([404, 1])
torch.Size([404])


In [0]:
#Reshape the labels into (-1,1) format
housing_labels = housing_labels.reshape(-1,1)

In [0]:
#Calculate the loss
loss = loss_fn(model(housing),housing_labels)

In [0]:
#Initialize the stochastic gradient descent function
opt = torch.optim.SGD(model.parameters(),lr=0.001)

In [0]:
#Function for fitting the data
def fit(num_epochs, model, loss_fn, opt, train_dl):
  #Set number of epochs
  for epoch in range(num_epochs+1):
    #Extract input and output from train_dl
    for xb,yb in train_dl:
      #Reshape the output to (-1,1) format
      yb = yb.reshape(-1,1)
      #Predict values 
      pred = model(xb)
      #Calculate loss
      loss = loss_fn(pred,yb)
      #Calculate the derivative of the loss function
      loss.backward()
      #Adjust the parameters
      opt.step()
      #Set gradients to 0
      opt.zero_grad()
    #Print epch count and loss value every 10 epochs
    if epoch%10 == 0:
      print('Epoch:',epoch,"Loss:",loss)

In [29]:
#Run the function with a 100 epochs on the train_dl dataset
fit(150, model, loss_fn, opt, train_dl)

Epoch: 0 Loss: tensor(238.4255, grad_fn=<MseLossBackward>)
Epoch: 10 Loss: tensor(32.6896, grad_fn=<MseLossBackward>)
Epoch: 20 Loss: tensor(18.8089, grad_fn=<MseLossBackward>)
Epoch: 30 Loss: tensor(13.5214, grad_fn=<MseLossBackward>)
Epoch: 40 Loss: tensor(11.1624, grad_fn=<MseLossBackward>)
Epoch: 50 Loss: tensor(9.8856, grad_fn=<MseLossBackward>)
Epoch: 60 Loss: tensor(9.0559, grad_fn=<MseLossBackward>)
Epoch: 70 Loss: tensor(8.4415, grad_fn=<MseLossBackward>)
Epoch: 80 Loss: tensor(7.9503, grad_fn=<MseLossBackward>)
Epoch: 90 Loss: tensor(7.5407, grad_fn=<MseLossBackward>)
Epoch: 100 Loss: tensor(7.1909, grad_fn=<MseLossBackward>)
Epoch: 110 Loss: tensor(6.8880, grad_fn=<MseLossBackward>)
Epoch: 120 Loss: tensor(6.6231, grad_fn=<MseLossBackward>)
Epoch: 130 Loss: tensor(6.3897, grad_fn=<MseLossBackward>)
Epoch: 140 Loss: tensor(6.1827, grad_fn=<MseLossBackward>)
Epoch: 150 Loss: tensor(5.9980, grad_fn=<MseLossBackward>)


In [30]:
#Reshape data, make predictions on the test set and print out loss
pred = model(housing_test)
pred = pred.reshape(-1,1)
housing_test_labels = housing_test_labels.reshape(-1,1)
loss = loss_fn(pred,housing_test_labels)
print(loss)

tensor(40.4573, grad_fn=<MseLossBackward>)
