# Introduction to Linear Regression

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import io

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.metrics import mean_squared_error

### Dataset

In [4]:
# Tiny tab-separated dataset: two features (x1, x2) and a target (y).
# The tabs are written as explicit \t escapes so they stay visible and cannot
# be silently turned into spaces by an editor; no field has trailing
# whitespace, so every value parses cleanly as an integer.
csv_data = '''
x1\tx2\ty
1\t2\t14
2\t3\t18
3\t4\t24
4\t5\t32
'''

In [5]:
# Echo the raw string so the tab (\t) and newline (\n) separators are visible.
csv_data

'\nx1\tx2\ty\n1\t2\t14\n2\t3\t18\n3\t4\t24 \n4\t5\t32\n'

In [6]:
# Parse the in-memory text as a table; read_table splits on tabs by default.
df = pd.read_table(io.StringIO(csv_data))

In [7]:
# Inspect the parsed frame: feature columns x1 and x2, target column y.
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


### Input & Output

In [8]:
# Feature matrix: all rows, only the two input columns (kept as a DataFrame).
X = df.loc[:, ['x1', 'x2']]

In [9]:
# Show the feature matrix (columns x1 and x2).
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [10]:
# Target vector: the y column as a Series.
y = df.loc[:, 'y']

In [11]:
# Show the target values.
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

### Linear Model

In [12]:
# Create an unfitted LinearRegression estimator with default settings.
regressor = LinearRegression()

In [13]:
# Fit on the full dataset: X holds the features, y the target.
regressor.fit(X,y)

LinearRegression()

In [14]:
# Learned coefficients, one per input column (x1, x2).
regressor.coef_

array([3., 3.])

In [15]:
# Learned intercept (the constant term of the fitted equation).
regressor.intercept_

4.000000000000007

In [16]:
# Fitted model: y = 4 + 3*x1 + 3*x2   (intercept_ + coef_[0]*x1 + coef_[1]*x2)

### Evaluate

In [17]:
# Predict for a few hand-written (x1, x2) pairs.
# The model was fitted on a DataFrame, so the inputs are wrapped in a DataFrame
# with the same column names; passing a bare list makes recent scikit-learn
# versions warn that X has no valid feature names.
regressor.predict(pd.DataFrame([(1, 2), (2, 3), (2, 3)], columns=X.columns))

array([13., 19., 19.])

In [18]:
# Predictions for the same rows the model was trained on.
regressor.predict(X)

array([13., 19., 25., 31.])

In [19]:
# Keep the in-sample predictions so they can be compared against y.
y_pred = regressor.predict(X)

In [20]:
# predict() returns a NumPy array.
type(y_pred)

numpy.ndarray

In [21]:
# The target y is a pandas Series.
type(y)

pandas.core.series.Series

### RMSE (root-mean-square error)

In [22]:
# Residuals: actual minus predicted, computed element-wise (the result keeps y's index).
y - y_pred

0    1.0
1   -1.0
2   -1.0
3    1.0
Name: y, dtype: float64

In [23]:
# Squared residuals: squaring removes the sign so errors cannot cancel out.
(y - y_pred)**2

0    1.0
1    1.0
2    1.0
3    1.0
Name: y, dtype: float64

In [24]:
# RMSE by hand: square the residuals, average them, then take the square root.
squared_errors = (y - y_pred) ** 2
rmse = np.sqrt(squared_errors.mean())

In [25]:
# Show the hand-computed RMSE.
rmse

1.0

In [26]:
# Cross-check the hand-computed RMSE against scikit-learn.
# The square root is taken explicitly rather than via `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while
# sqrt(MSE) works on every version.
np.sqrt(mean_squared_error(y, y_pred))

1.0

### Train-Test split

In [27]:
# Recap of the full dataset before it is split.
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


In [28]:
# Recap of the features before they are split.
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [29]:
# Recap of the target before it is split.
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

In [30]:
# Manual hold-out split: the first three rows train, the last row tests.
# .loc slicing is label-based and includes both endpoints, so 0:2 selects
# the rows labelled 0, 1 and 2.
X_train = X.loc[0:2]
y_train = y.loc[0:2]

# A list selector is used for both so that X_test stays a DataFrame and y_test
# stays a Series; y.loc[3] would collapse the target to a bare scalar.
X_test = X.loc[[3]]
y_test = y.loc[[3]]

In [31]:
# lib 
from sklearn.model_selection import train_test_split

In [32]:
# Random hold-out split: 25% of the rows go to the test set.
# A fixed random_state makes the shuffle, and therefore every number computed
# from this split in later cells, identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [33]:
# Start from a fresh, unfitted estimator for the train/test experiment.
regressor = LinearRegression()

In [34]:
# Fit on the training portion only; the test rows stay unseen.
regressor.fit(X_train, y_train)

LinearRegression()

In [35]:
# Coefficients learned from the training rows only.
regressor.coef_

array([2.92857143, 2.92857143])

In [36]:
# Intercept learned from the training rows only.
regressor.intercept_

4.785714285714288

In [37]:
# Fitted model for this split: y = 4.785714285714288 + 2.92857143*x1 + 2.92857143*x2
# (these values depend on which rows ended up in the training set)

In [38]:
# Predict on the held-out test rows.
y_pred = regressor.predict(X_test)

In [39]:
# Predicted target for the test rows.
y_pred

array([19.42857143])

In [40]:
# True target for the test rows, for comparison with the prediction.
y_test

1    18
Name: y, dtype: int64

In [41]:
# RMSE on the held-out test rows.
# The square root is taken explicitly rather than via `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while
# sqrt(MSE) works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

1.4285714285714306

## K-Fold Cross-Validation

In [42]:
#lib
from sklearn.model_selection import KFold

In [43]:
# 4-fold splitter with shuffling.
# random_state is fixed because, without it, every call to kf.split()
# reshuffles the rows: the folds printed in one cell would not necessarily be
# the folds used for scoring in another, and results would change per run.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

In [53]:
# Number of train/test iterations the splitter will produce.
kf.get_n_splits(X)

4

In [52]:
# Show, for every fold, which row positions are trained on and which are held out.
separator = '-----------------------'
for train_index, test_index in kf.split(X):
    print('train index = ', train_index)
    print('test index = ', test_index)
    print(separator)


train index =  [0 1 2]
test index =  [3]
-----------------------
train index =  [0 2 3]
test index =  [1]
-----------------------
train index =  [0 1 3]
test index =  [2]
-----------------------
train index =  [1 2 3]
test index =  [0]
-----------------------


In [46]:
# Cross-validated RMSE: fit a fresh model on each fold's training rows and
# score it on that fold's held-out rows, collecting one RMSE per fold.
RMSEs = []

for train_index, test_index in kf.split(X):
    # KFold yields positional row numbers, so rows are selected with .iloc;
    # .loc would look the numbers up as index labels, which only happens to
    # work while the frame has a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # create a model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # evaluate the model
    y_pred = regressor.predict(X_test)
    # sqrt(MSE) instead of `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse_i = np.sqrt(mean_squared_error(y_test, y_pred))

    RMSEs.append(rmse_i)

In [47]:
# One RMSE per fold, in the order the folds were generated.
RMSEs

[3.333333333333327, 1.4285714285714306, 1.428571428571427, 3.3333333333333357]

In [48]:
# Average the per-fold RMSEs into a single cross-validation score.
np.mean(RMSEs)

2.38095238095238