In [8]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [26]:
def readCSV(fileName: str):
    """Load a numeric .csv file and split it into features and labels.

    The first row of the file is skipped as a header. Column 0 of every
    remaining record is returned as the label vector; all other columns are
    returned as the feature matrix.

    Args:
        fileName (str): Path to the comma-separated file.

    Returns:
        tuple: ``(datasets, labels)`` where ``datasets`` is a 2-D numpy array
        holding columns 1..end of every record and ``labels`` is a 1-D numpy
        array holding column 0.

    Raises:
        OSError: The file does not exist or cannot be opened
            (``FileNotFoundError`` is a subclass). "Open file error" is
            printed before the exception is re-raised.
        ValueError: Propagated from ``np.loadtxt`` when a field cannot be
            parsed as a number.
    """
    try:
        # ndmin=2 keeps a file with a single record two-dimensional, so the
        # column slicing below works for any number of rows.
        data = np.loadtxt(fileName, delimiter=',', skiprows=1, ndmin=2)
    except OSError:
        # Keep the original console message, but re-raise so the caller sees
        # the real cause instead of failing later while unpacking None.
        print("Open file error")
        raise

    datasets = data[:, 1:]
    labels = data[:, 0]
    return datasets, labels

In [27]:
# Path is relative to the notebook's working directory.
fileName = "accelerometer.csv"
# X: feature matrix (all columns after the first); y: labels (first column).
X, y = readCSV(fileName)

In [11]:
# Inspect the feature matrix returned by readCSV (every CSV column after the first).
X

array([[ 2.000e+01,  1.004e+00,  9.000e-02, -1.250e-01],
       [ 2.000e+01,  1.004e+00, -4.300e-02, -1.250e-01],
       [ 2.000e+01,  9.690e-01,  9.000e-02, -1.210e-01],
       ...,
       [ 1.000e+02,  1.156e+00, -9.400e-02, -2.270e-01],
       [ 1.000e+02,  9.340e-01,  2.030e-01, -1.720e-01],
       [ 1.000e+02,  1.199e+00, -1.760e-01,  1.090e-01]])

In [12]:
# Inspect the label vector returned by readCSV (the first CSV column, flattened to 1-D).
y

array([1., 1., 1., ..., 3., 3., 3.])

In [16]:
# Hold out a quarter of the records for evaluation; the fixed random_state
# makes the shuffle (and therefore every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=32,
)

In [17]:
# Ordinary least-squares fit on the training split. fit() returns the
# estimator itself, so leaving the call as the last expression displays it.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [35]:
# Evaluate the fitted model on the held-out split.
y_pred = lin_reg.predict(X_test)
residuals = y_pred - y_test
rss = np.sum(np.square(residuals))  # residual sum of squares, reused below

# Split sizes and feature count.
print("The training dataset size is: ", len(X_train))
print("The testing dataset size is: ", len(X_test))
print("The feature size is: ", X_train.shape[1])

# Goodness-of-fit metrics on the test split.
print("The R-squared value is: ", lin_reg.score(X_test, y_test))
print("The rss is: ", rss)
print("The correlation coefficient is: ", np.corrcoef(y_pred, y_test)[0, 1])
# RSS divided by the number of test samples.
print("The weighted rss is: ", rss / len(y_test))
print("The intercept is: ", lin_reg.intercept_)

The training dataset size is:  114750
The testing dataset size is:  38250
The feature size is:  4
The R-squared value is:  -0.00011956232396514999
The rss is:  25446.625989445376
The correlation coefficient is:  0.007135203992771844
The weighted rss is:  0.6652712676979183
The intercept is:  2.00275265822727
