In [1]:
import numpy as np
import pandas as pd

In [2]:
# Lift pandas' display truncation so frames are rendered in full:
# None means "no limit" for both the row and the column cap.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## Load training dataset

In [3]:
# Read the training set from train.csv in the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")

In [4]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [5]:
# Inspect dtypes and non-null counts. The recorded output shows y with 699
# non-null values out of 700 rows, i.e. one row is missing its target.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       700 non-null    float64
 1   y       699 non-null    float64
dtypes: float64(2)
memory usage: 11.1 KB


## Clean training dataset

In [6]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how="any")

In [7]:
# Re-check after dropna(): the recorded output shows 699 rows with both
# columns fully populated.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       699 non-null    float64
 1   y       699 non-null    float64
dtypes: float64(2)
memory usage: 16.4 KB


In [8]:
# Pull the feature (x) and target (y) columns out as plain NumPy arrays.
train_X = train_df.loc[:, "x"].to_numpy()
train_Y = train_df.loc[:, "y"].to_numpy()

## Estimate linear regression coefficients

In [9]:
# Sample means of the feature and the target; both feed the closed-form fit.
train_X_mean = train_X.mean()
train_Y_mean = train_Y.mean()

In [10]:
# Closed-form ordinary least squares for a single feature:
#   slope     w1 = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
#   intercept w0 = mean_y - w1 * mean_x
# Computed with vectorised NumPy operations instead of a Python-level loop.
x_dev = train_X - train_X_mean
y_dev = train_Y - train_Y_mean

num = np.sum(x_dev * y_dev)
dnum = np.sum(x_dev ** 2)

# With no rows, or with all x values identical, the denominator is zero and
# the slope is undefined; fail loudly rather than propagate NaN downstream.
if dnum == 0:
    raise ValueError(
        "Cannot estimate the slope: x has zero variance (or no rows)."
    )

w1 = num / dnum
w0 = train_Y_mean - w1 * train_X_mean

print("Intercept: ", w0)
print("Slope: ", w1)

Intercept:  -0.10726546430100825
Slope:  1.0006563818563046


## Load testing dataset

In [11]:
# Read the held-out test set from test.csv in the working directory.
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [12]:
# Preview the first five rows of the test data.
test_df.head(n=5)

Unnamed: 0,x,y
0,77,79.775152
1,21,23.177279
2,22,25.609262
3,20,17.857388
4,36,41.849864


In [13]:
# Inspect the test frame. The recorded output shows 300 rows with no missing
# values, and x stored as int64 (the training x column is float64).
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       300 non-null    int64  
 1   y       300 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 4.8 KB


In [14]:
# Pull the test feature (x) and target (y) columns out as NumPy arrays.
test_X = test_df.loc[:, "x"].to_numpy()
test_Y = test_df.loc[:, "y"].to_numpy()

## Predict Y for testing data

In [15]:
# Apply the fitted line y = w0 + w1 * x to every test point at once.
# Broadcasting over the test_X array yields an ndarray of predictions
# (same element values as a per-element loop, without the Python overhead).
pred_Y = w0 + w1 * test_X

## Evaluate model performance using RMSE (root mean square error) as the metric

In [16]:
# Root mean square error: square root of the mean squared residual
# between the true and the predicted test targets.
residuals = test_Y - pred_Y
RMSE = np.sqrt(np.mean(residuals ** 2))

In [17]:
# Report the test-set RMSE computed in the previous cell.
print("Root Mean Square Error: ", RMSE)

Root Mean Square Error:  3.0713062680298244


## Linear regression using scikit-learn
### (Just to check whether the manually calculated results are correct)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [19]:
# scikit-learn expects a 2-D feature matrix, so turn the 1-D x array into a
# single-column matrix before fitting.
model = LinearRegression()
model.fit(train_X[:, np.newaxis], train_Y)

In [20]:
# Fitted parameters, for comparison with the manually estimated w0 / w1.
# coef_ prints as a one-element array (see the recorded output) because the
# model was fitted on a single feature column.
print('intercept:', model.intercept_)
print('slope:', model.coef_)

intercept: -0.10726546430097272
slope: [1.00065638]


In [21]:
# Predict with the scikit-learn model; the 1-D test x array is presented as a
# single-column matrix, matching the shape used when fitting.
pred_Y = model.predict(test_X[:, np.newaxis])

In [22]:
# RMSE via scikit-learn for cross-checking the manual value.
# The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, so take the square root of the plain MSE instead; this works on
# every scikit-learn version.
np.sqrt(metrics.mean_squared_error(test_Y, pred_Y))

3.071306268029827