# Linear Regression

## Regression on a simple dataset

In [4]:
import pandas as pd 

path = '../datasets/50_Startups.csv'
dataset = pd.read_csv(path)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
# Encoding categorical data
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [3]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [4]:
from LinearRegression import LinearRegression
regressor = LinearRegression(mode=2)
regressor.fit(X_train, y_train)

[-771.07991043 -770.97704475 -770.98396591  257.10546253  703.27197797
 1982.47205407]
[-1507.34869883 -1507.24717125 -1507.25267898   501.02949006
  1375.19018564  3857.93809645]
[-2210.55643985 -2210.45621585 -2210.46034385   732.55808469
  2017.35704182  5631.79173874]
[-2882.36380781 -2882.26485442 -2882.26763476   952.34697887
  2631.2662361   7309.26435962]
[-3524.34675216 -3524.24903794 -3524.25050113  1161.01818451
  3218.33524577  8895.31952345]
[-4138.00082953 -4137.90432444 -4137.90449954  1359.16171909
  3779.90923266 10394.66668029]
[-4724.74531415 -4724.64998955 -4724.64890421  1547.33724344
  4317.26474064 11811.77416519]
[-5285.9270981  -5285.83292662 -5285.83060717  1726.07561577
  4831.61320402 13150.88153255]
[-5822.82439205 -5822.73134753 -5822.72781905  1895.88036614
  5324.10427659 14416.01125966]
[-6336.65023668 -6336.55829415 -6336.55358051  2057.22909557
  5795.82899025 15610.97985145]
[-6828.55583459 -6828.46497017 -6828.45909412  2210.57480358
  6247.8227522 

In [5]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[101047.32511392562 103282.38]
 [127872.52875823203 144259.4]
 [128620.62656136246 146121.95]
 [72243.06965094828 77798.83]
 [171431.66304172942 191050.39]
 [111517.45562502078 105008.31]
 [66853.83067413131 81229.06]
 [95705.56406944158 97483.56]
 [111135.8539301893 110352.25]
 [161612.88741360742 166187.94]]


In [6]:
from LinearRegression import r2_score

r2_score(y_test, y_pred)

R-squared: 0.9007496120264314


## Regression on a more complicated dataset

In [6]:
path = '../datasets/Combined_cycle_plants.ods'
dataset = pd.read_excel(path, engine='odf', sheet_name="Sheet1")
dataset.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [8]:
dataset.info(verbose="True")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


## Data Preprocessing

In [9]:
import numpy as np

target = dataset['PE']
y = np.asarray(target)
X = dataset.iloc[:, 0:4].values

#print(X)
#print(y)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3,
                                                    stratify=None, #preserve target proportions 
                                                    random_state= 123) #fix random seed for replicability

print(X_train.shape, X_test.shape)

(6697, 4) (2871, 4)


In [11]:
from LinearRegression import LinearRegression
regressor = LinearRegression( learning_rate=0.2)
regressor.fit(X_train, y_train)

[-58.76 -51.33 157.25 -47.03]
[-70.59 -61.67 188.75 -56.42]
[-72.98 -63.79 195.05 -58.26]
[-73.49 -64.26 196.3  -58.58]
[-73.62 -64.4  196.54 -58.6 ]
[-73.68 -64.48 196.58 -58.55]
[-73.72 -64.54 196.57 -58.5 ]
[-73.75 -64.6  196.56 -58.44]
[-73.79 -64.66 196.54 -58.38]
[-73.82 -64.71 196.53 -58.32]
[-73.86 -64.77 196.51 -58.27]
[-73.89 -64.83 196.5  -58.21]
[-73.93 -64.89 196.48 -58.15]
[-73.96 -64.95 196.47 -58.09]
[-74.   -65.01 196.45 -58.03]
[-74.03 -65.07 196.44 -57.98]
[-74.07 -65.13 196.43 -57.92]
[-74.1  -65.18 196.41 -57.86]
[-74.14 -65.24 196.4  -57.8 ]
[-74.18 -65.3  196.38 -57.74]
[-74.21 -65.36 196.37 -57.69]
[-74.25 -65.42 196.35 -57.63]
[-74.28 -65.48 196.34 -57.57]
[-74.32 -65.54 196.32 -57.51]
[-74.35 -65.6  196.31 -57.45]
[-74.39 -65.65 196.29 -57.4 ]
[-74.42 -65.71 196.28 -57.34]
[-74.46 -65.77 196.26 -57.28]
[-74.49 -65.83 196.25 -57.22]
[-74.53 -65.89 196.23 -57.17]
[-74.56 -65.95 196.22 -57.11]
[-74.6  -66.01 196.2  -57.05]
[-74.63 -66.06 196.19 -56.99]
[-74.67 -6

In [12]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[455.98 455.41]
 [448.62 441.64]
 [463.38 471.63]
 ...
 [460.9  464.38]
 [453.82 450.72]
 [440.62 432.68]]


In [13]:
from LinearRegression import r2_score

r2_score(y_test, y_pred)

R-squared: 0.6712305130991498


## Comparison with Scikit-learn API

In [14]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regressor.coef_[0][idx]))

y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[447.73 455.41]
 [437.86 441.64]
 [471.82 471.63]
 ...
 [462.6  464.38]
 [452.19 450.72]
 [429.7  432.68]]


In [15]:
from sklearn.metrics import r2_score

# Calcola il coefficiente R²
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

R-squared: 0.9232530522555914
