In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold

#### Load the dataset from the file Ecommerce.csv

```
This dataset includes the following data for 200 online customers:

- Email
- Address
- Avatar
- Avg Session Length (average time they spent per session)
- Time on App (time they spent using the application)
- Time on Website (time they spent using the website)
- Length of Membership (for how long they have been a user)
- Yearly Amount Spent (how much money they spend yearly)

Your goal will be to estimate the value of the Yearly Amount Spent using the feature(s) of the users
```

#### Get rid of non-numerical columns of the dataset

In [None]:
# Load the customer dataset; the relative path means Ecommerce.csv is looked up
# in the notebook's working directory.
data = pd.read_csv("Ecommerce.csv")
# Preview the first rows to check that the columns were parsed as expected.
data.head()

In [None]:
# Summarise column dtypes and non-null counts to see which columns are non-numerical.
data.info()

In [None]:
# Keep only the numerical predictors; the target stays a one-column DataFrame.
numeric_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
data_num = data.loc[:, numeric_columns]
Y = data.loc[:, ['Yearly Amount Spent']]

#### Use Yearly Amount Spent as your target variable. Plot the relationship of each of the numerical features with the target variable

In [None]:
# Scatter every numerical feature against the target so the feature with the
# clearest linear relationship can be picked by eye.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

feature_names = ["Avg Session Length", "Time on App", "Time on Website", "Length of Membership"]

# axes.flat walks the 2x2 grid row by row, so the panels keep their original order.
for ax, feature in zip(axes.flat, feature_names):
    ax.scatter(data[feature], data["Yearly Amount Spent"])
    ax.set_xlabel(feature, size=13)
    ax.set_ylabel("Yearly Amount Spent", size=13)

# Both of these must be *called*: a bare `fig.tight_layout` / `plt.show` only
# references the method and does nothing. The layout is adjusted after the
# labels exist so that it can make room for them.
fig.tight_layout()
plt.show()

#### Leave only one feature that has a linear relationship with the target variable. Divide the dataset into training and test sets with the ratio 80:20.

In [None]:
# Single-feature problem: one predictor column (kept 2-D) and the target as a Series.
feature_1d = data[['Length of Membership']]
target_1d = data['Yearly Amount Spent']

# 80:20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_1d,
    target_1d,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Convert the training features to a plain NumPy array for the matrix algebra used below.
X_train = np.array(X_train)

In [None]:
# Convert the test features to a plain NumPy array, mirroring the training features.
X_test = np.array(X_test)

#### Use the resulting train and test sets with the next 2 functions to fit linear regression and plot the resulting regression line.

The class `LinReg` finds the equation of the line that best describes the fitted data. Using that equation, the model predicts the target value for unseen data and calculates the R2 score of the predicted values.

In [None]:
class LinReg():
    """
    Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the following attributes are available:

    - ``betta``: the full coefficient vector, intercept first.
    - ``intercept``: ``betta[0]``.
    - ``coef``: ``betta[1:]``, one entry per feature column.
    - ``data_matrix``: the design matrix (a column of ones followed by the features).

    With a 1-D target the coefficients are 1-D; with a 2-D column target
    (e.g. a one-column DataFrame) they keep a trailing axis of length 1.
    """
    def __init__(self):
        self.X_train = None
        self.y_train = None
        self.coef = None
        self.intercept = None

    def fit(self, X_train, y_train):
        """
        Estimate the intercept and slope(s) from the training data.

        :param X_train: 2-D array-like of shape (n_samples, n_features)
        :param y_train: array-like of shape (n_samples,) or (n_samples, 1)
        """
        self.X_train = X_train
        self.y_train = y_train

        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(y_train, dtype=float)

        # Prepend a column of ones so the first coefficient acts as the intercept.
        self.data_matrix = np.append(np.ones((features.shape[0], 1)), features, 1)

        # Solve the least-squares problem directly rather than forming
        # inv(X^T X) X^T y: an explicit inverse raises on collinear features and
        # loses precision when X^T X is ill-conditioned. lstsq returns the
        # minimum-norm solution in the rank-deficient case.
        self.betta = np.linalg.lstsq(self.data_matrix, targets, rcond=None)[0]

        self.intercept = self.betta[0]
        self.coef = self.betta[1:]

    def predict(self, X_test):
        """
        Predict targets for new samples using the fitted line/hyperplane.

        :param X_test: 2-D array-like of shape (n_samples, n_features)
        :return: predictions computed as ``intercept + X_test @ coef``
        """
        y_pred = self.intercept + np.dot(X_test, self.coef)
        return y_pred

    def score(self, X, y):
        """
        Return the R2 score of the model's predictions for ``X`` against ``y``.
        """
        return r2_score(y, self.predict(X))

In [None]:
# Fit the hand-written regression on the single-feature training split.
reg = LinReg()
reg.fit(X_train, y_train)

In [None]:
# Display the fitted slope and intercept for later comparison with sklearn's values.
reg.coef, reg.intercept

In [None]:
# Predict the target for the held-out test features with the hand-written model.
predictions = reg.predict(X_test)

The second function is `plot_fitted_line`, which takes $\beta_0, \beta_1, x, y$ and plots a scatter plot of the data together with the line $y = \beta_0 + \beta_1 x$, then saves the plot in `YOURNAME.png`.

In [None]:
def plot_fitted_line(b0: float,
                     b1: float,
                     X,
                     y,
                     fname: str = 'MyFigure.png'):
    """
    Plot the data as a scatter plot together with the line y = b0 + b1 * x,
    save the figure to disk and display it.

    :param b0: Intercept of the line to plot (a scalar or a one-element array)
    :param b1: Slope of the line to plot (a scalar or a one-element array)
    :param X: x coordinates of the data points for a single feature; a pandas
              Series, a 1-D array or an (n, 1) array
    :param y: y coordinates (labels) of the data points, same length as X
    :param fname: Path of the image file the figure is saved to
    """
    # Flatten everything to plain 1-D arrays / scalars so the line is computed
    # in one vectorised step regardless of whether X arrived as a column
    # vector or b1 as a one-element coefficient array.
    x_values = np.asarray(X, dtype=float).ravel()
    y_values = np.asarray(y).ravel()
    slope = float(np.ravel(b1)[0])
    intercept = float(np.ravel(b0)[0])

    y_line = slope * x_values + intercept
    plt.plot(x_values, y_line, c='red')
    plt.scatter(x_values, y_values)
    # Save before showing, because showing may release the current figure.
    plt.savefig(fname=fname)
    # plt.show must be called; a bare `plt.show` only references the function.
    plt.show()

In [None]:
# Pass the scalar slope (reg.coef is a one-element array for a single feature),
# matching how the sklearn fit is plotted with model.coef_[0].
plot_fitted_line(reg.intercept, reg.coef[0], X_train, y_train)

#### Now fit the linear regression to the data containing only 1 feature using sklearn and compare the results with your own fit_1d_linear_regression function

In [None]:
# Fit scikit-learn's reference implementation on the same single-feature training split.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Display sklearn's slope and intercept to compare with reg.coef / reg.intercept.
model.coef_, model.intercept_

In [None]:
# Plot sklearn's fitted line over the training data; coef_[0] is the single slope.
plot_fitted_line(model.intercept_, model.coef_[0], X_train, y_train)

#### Use sklearn.metrics to calculate Mean absolute error, Mean squared error, Root mean squared error and R-squared for the results of your own function vs the sklearn regression results. Try to interpret your results.

In [None]:
# Test-set predictions from the hand-written model and from sklearn's model.
my_predictions = reg.predict(X_test)
y_pred = model.predict(X_test)

In [None]:
# Compute each metric once for both models, then report them side by side.
mae_mine = mean_absolute_error(y_test, my_predictions)
mae_sklearn = mean_absolute_error(y_test, y_pred)
mse_mine = mean_squared_error(y_test, my_predictions)
mse_sklearn = mean_squared_error(y_test, y_pred)
r2_mine = r2_score(y_test, my_predictions)
r2_sklearn = r2_score(y_test, y_pred)

print(f'Mean absolute error for my model: {mae_mine}')
print(f'Mean absolute error for orginal model: {mae_sklearn}\n')

print(f'Mean squared error for my model: {mse_mine}')
print(f'Mean squared error for original model: {mse_sklearn}\n')

# RMSE is the square root of the MSE values computed above.
print(f'Root mean squared error for my model: {np.sqrt(mse_mine)}')
print(f'Root mean squared error for original model: {np.sqrt(mse_sklearn)}\n')

print(f'R-squared for my model: {r2_mine}')
print(f'R-squared for original model: {r2_sklearn}')

#### Fit linear regression to the data containing all numerical features using sklearn and use the same metrics to evaluate how good the model is.

In [None]:
# Re-split using all numerical features. A fixed random_state keeps the split,
# and therefore every metric reported below, reproducible across kernel
# restarts, in line with the seeded single-feature split earlier.
X_train, X_test, y_train, y_test = train_test_split(data_num, Y, test_size=0.2, random_state=0)

In [None]:
# Fit the hand-written regression on all numerical features and predict on the test split.
# NOTE(review): Y was selected as a one-column DataFrame, so the target here is 2-D.
full_reg = LinReg()
full_reg.fit(X_train, y_train)
predictions = full_reg.predict(X_test)

In [None]:
# Fit sklearn's LinearRegression on the same multi-feature split for comparison.
model_full = linear_model.LinearRegression()
model_full.fit(X_train, y_train)
y_pred = model_full.predict(X_test)

In [None]:
# Compute each metric once for both multi-feature models, then report them side by side.
mae_mine = mean_absolute_error(y_test, predictions)
mae_sklearn = mean_absolute_error(y_test, y_pred)
mse_mine = mean_squared_error(y_test, predictions)
mse_sklearn = mean_squared_error(y_test, y_pred)
r2_mine = r2_score(y_test, predictions)
r2_sklearn = r2_score(y_test, y_pred)

print(f'Mean absolute error for my model: {mae_mine}')
print(f'Mean absolute error for orginal model: {mae_sklearn}\n')

print(f'Mean squared error for my model: {mse_mine}')
print(f'Mean squared error for original model: {mse_sklearn}\n')

# RMSE is the square root of the MSE values computed above.
print(f'Root mean squared error for my model: {np.sqrt(mse_mine)}')
print(f'Root mean squared error for original model: {np.sqrt(mse_sklearn)}\n')

print(f'R-squared for my model: {r2_mine}')
print(f'R-squared for original model: {r2_sklearn}')