# Linear Regression

## Part 1 - Data Preprocessing

In [83]:
import pandas as pd

### Importing the dataset

In [84]:
import os

# Show the directory the notebook is running from; data.xlsx is loaded relative to it.
working_dir = os.getcwd()
print(working_dir)

/Users/oscarsanchez/PYTHON/MLApprentice/MachineLearning_CodesAndDatasets/Level1/Regression


In [36]:
# Load the dataset from data.xlsx in the current working directory.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/" inside an f-string.
df = pd.read_excel(os.path.join(os.getcwd(), 'data.xlsx'))

In [37]:
# Preview the first rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [38]:
# Preview the last rows to confirm the file was read through to the end.
df.tail()

Unnamed: 0,AT,V,AP,RH,PE
9563,16.65,49.69,1014.01,91.0,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74
9567,21.6,62.52,1017.23,67.87,453.28


In [39]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [40]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(9568, 5)

In [41]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [42]:
# Total number of rows, used as the denominator for the percentages below.
row_count = len(df.index)

# Number of null entries in each column.
missing_values = df.isnull().sum()

# The same counts expressed as a percentage of all rows.
missing_percent = (missing_values / row_count) * 100

print(missing_values)

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64


In [43]:
# Report only the columns that actually contain missing values, as a percentage
# of all rows. The format spec must be ".2f" (two decimal places); the previous
# "2f" meant "minimum width 2" and printed the default six decimals.
for column, percent in missing_percent[missing_percent > 0].items():
    print(f"{column}: {percent:.2f}%")

### Getting the inputs and output

In [None]:
# Feature matrix: every column except the last one, as a 2D NumPy array.
X = df.iloc[:, :-1].to_numpy()
# Target vector: the last column only, as a 1D NumPy array.
y = df.iloc[:, -1].to_numpy()

In [46]:
# Display the feature matrix.
X

array([[  14.96,   41.76, 1024.07,   73.17],
       [  25.18,   62.96, 1020.04,   59.08],
       [   5.11,   39.4 , 1012.16,   92.14],
       ...,
       [  31.32,   74.33, 1012.92,   36.48],
       [  24.48,   69.45, 1013.86,   62.39],
       [  21.6 ,   62.52, 1017.23,   67.87]])

In [47]:
# Display the target vector.
y

array([463.26, 444.37, 488.56, ..., 429.57, 435.74, 453.28])

In [48]:
# Shapes of the feature matrix and the target vector, one per line.
print(X.shape, y.shape, sep="\n")

(9568, 4)
(9568,)


### Creating the Training Set and the Test Set

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [50]:
from sklearn.linear_model import LinearRegression
# Create the regressor with no arguments, i.e. with scikit-learn's default settings.
model = LinearRegression()

### Training the model

In [51]:
# Fit the model on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

LinearRegression()

### Inference

Making predictions for the data points in the test set

In [57]:
# Predict the target for every row of the held-out test features.
y_predictions = model.predict(X_test)

In [58]:
# Display the predicted values.
y_predictions

array([431.42761597, 458.56124622, 462.75264705, ..., 469.51835895,
       442.41759454, 461.88279939])

In [60]:
# Display the actual test targets for comparison with the predictions.
y_test

array([431.23, 460.01, 461.14, ..., 473.26, 438.  , 463.28])

Making a prediction for a single row of the independent feature set

In [73]:
# Slicing with [:1] keeps the first test row as a 2D array (one row, four features).
X_test[:1]

array([[  28.66,   77.95, 1009.56,   69.07]])

In [None]:
# model.predict must be given a 2D array, just as X_test is 2D, so the single
# row is selected with the slice [:1] rather than the index [0].
y_single_prediction = model.predict(X_test[:1])

In [74]:
# The prediction comes back as an array; element 0 is the single predicted value.
y_single_prediction[0]

431.42761597061804

In [None]:
# Actual target value for the same test row, for comparison with the single prediction.
y_test[0]

431.23

## Part 3 - Evaluating the model

### R-Squared

In [75]:
from sklearn.metrics import r2_score

In [76]:
# R-squared of the predictions against the actual targets of the test set.
r2 = r2_score(y_test, y_predictions)
r2

0.9325315554761303

The adjusted R-squared calculation uses two quantities:

- `n` = total number of observations in the test set
- `k` = total number of features

In [None]:
# Shape of the test feature matrix: (n observations, k features).
X_test.shape

(1914, 4)

In [81]:
# Unpack the test-set dimensions: n rows (observations) and k columns (features).
n, k = X_test.shape
print(n, k, sep="\n")

1914
4


### Adjusted R-Squared

In [82]:
def adjusted_r2(r2, n, k):
    """Return the adjusted R-squared for a score ``r2`` on ``n`` observations and ``k`` features.

    Evaluates ``1 - (1 - r2) * (n - 1) / (n - k - 1)``: the unexplained share
    ``1 - r2`` is scaled by ``(n - 1) / (n - k - 1)`` before being subtracted
    from 1.
    """
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)


adj_r2 = adjusted_r2(r2, n, k)
adj_r2

0.9323901862890713