In [1]:
# Mean absolute error
def MAE(y, y_pred):
    """Return the mean absolute error between observed and predicted values.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, broadcastable against ``y``.

    Returns
    -------
    scalar
        Mean of ``|y - y_pred|`` taken over all elements.
    """
    # np.subtract (rather than the ``-`` operator) also accepts plain Python
    # lists and tuples, which is how RSS computes its residuals as well.
    return np.mean(np.abs(np.subtract(y, y_pred)))

In [None]:
# from LinearRegression
# Root mean squared error
def RMSE(y, y_pred):
    """Return the root mean squared error between observed and predicted values.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, broadcastable against ``y``.

    Returns
    -------
    scalar
        Square root of the mean of ``(y - y_pred)**2`` over all elements.
    """
    # np.subtract (rather than the ``-`` operator) also accepts plain Python
    # lists and tuples, which is how RSS computes its residuals as well.
    residuals = np.subtract(y, y_pred)
    mse = np.mean(np.square(residuals))  # MSE
    return np.sqrt(mse)  # RMSE


In [None]:
# from LinearRegression
# Verify RSS score
def RSS(y, y_pred):
    """Residual sum of squares: the sum over all elements of ``(y - y_pred)**2``."""
    residuals = np.subtract(y, y_pred)
    squared_residuals = np.square(residuals)
    return np.sum(squared_residuals)

In [None]:
# Preprocess data:
# one-hot encode the qualitative columns and fill missing values with the column median.

def preprocess(df):
    """One-hot encode ``df`` and fill the remaining gaps with column medians.

    The input frame is left untouched; a new DataFrame is returned.
    ``pd.get_dummies`` is called with ``dummy_na=True``, so every encoded
    column also gets an indicator column for its missing entries.
    """
    # Copy first, then expand the columns get_dummies encodes into indicators
    encoded = pd.get_dummies(df.copy(), dummy_na=True)

    # Replace NaNs column by column with that column's median
    for name in encoded.columns:
        column = encoded[name]
        encoded[name] = column.fillna(column.median())

    return encoded

In [None]:
#Standardize feature (mean = 0, sd= 1):
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only,
# then apply that same fitted scaler to both splits
scaler = StandardScaler().fit(X_tr)
X_tr_rescaled = scaler.transform(X_tr)
X_val_rescaled = scaler.transform(X_val)

In [None]:
#Linear regression
# plt is imported here because this is the first plotting cell; the only other
# visible import of matplotlib sits in a later cell, which breaks a top-to-bottom run.
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the rescaled training features
lr = LinearRegression()
lr.fit(X_tr_rescaled, y_tr)

# Compute predictions for the validation split
y_val_lr = lr.predict(X_val_rescaled)

# Plot predicted (x axis) against observed (y axis) validation values
plt.scatter(y_val_lr, y_val)
plt.xlabel("Predicted values")
plt.ylabel("Observed values")
plt.show()

In [None]:
#Ridge regression
# Ridge is imported here because the only other visible import of it sits in a
# later cell, so this cell raised NameError on a top-to-bottom run.
from sklearn.linear_model import Ridge

# Fit a ridge model with a small regularization strength
ridge2 = Ridge(alpha=1e-4)
ridge2.fit(X_rescaled, y)

# Compute predictions
y_values_ridge2 = ridge2.predict(X_values_rescaled)

# Plot the data against the first feature, with the fitted model on top
plt.scatter(X_rescaled[:, 0], y)
plt.plot(X_values_rescaled[:, 0], y_values_ridge2, c="C3", label="tuned ridge")
plt.legend()
plt.show()

# Ridge regression coefficients, one line per feature
for feature, coef in zip(features, ridge2.coef_):
    print("{:<6}: {:>4.1f}".format(feature, coef))

In [None]:
#Lasso regression :
from sklearn.linear_model import Lasso

# Lasso regression
# alpha sets the regularization strength; max_iter caps the solver iterations.
# max_iter must be an integer: scikit-learn's parameter validation rejects a
# float such as 1e5.
lasso = Lasso(alpha=1e-4, max_iter=100000)
lasso.fit(X_rescaled, y)

# Compute predictions
y_values_lasso = lasso.predict(X_values_rescaled)

# Plot the data against the first feature, with the fitted model on top
plt.scatter(X_rescaled[:, 0], y)
plt.plot(X_values_rescaled[:, 0], y_values_lasso, c="C3", label="lasso")
plt.legend()
plt.show()

# Lasso regression coefficients, one line per feature
for feature, coef in zip(features, lasso.coef_):
    print("{:<6}: {:>4.1f}".format(feature, coef))

In [None]:
# Ordinary least squares, from LinearRegression
def compute_ols_with_noise(temp_C, users, noise_scale=0.01, random_state=None):
    """Fit OLS on two nearly collinear temperature features.

    The Celsius temperatures are converted to Fahrenheit, a small Gaussian
    perturbation is added to the Fahrenheit values, and both columns (plus a
    column of ones for the intercept) are passed to ``lstsq``.

    Parameters
    ----------
    temp_C : ndarray
        Temperatures in degrees Celsius; must support ``1.8 * temp_C + 32``
        and expose ``.shape``.
    users : array-like
        Target values, one per temperature.
    noise_scale : float, default 0.01
        Standard deviation of the Gaussian noise added to the Fahrenheit column.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for reproducible noise. With ``None`` the legacy
        global ``np.random`` state is used, so a caller's ``np.random.seed``
        still applies.

    Returns
    -------
    tuple
        ``(w, rss, rank, X1)``: the first three values returned by ``lstsq``
        and the design matrix with the leading column of ones.
    """
    # NOTE(review): ``lstsq`` is not defined in this cell; presumably
    # scipy.linalg.lstsq or numpy.linalg.lstsq is imported elsewhere - confirm.

    # Convert to degrees Fahrenheit
    temp_F = 1.8 * temp_C + 32

    # Add small variations
    if random_state is None:
        noise = np.random.normal(loc=0, scale=noise_scale, size=temp_F.shape)
    else:
        rng = np.random.default_rng(random_state)
        noise = rng.normal(loc=0, scale=noise_scale, size=temp_F.shape)
    temp_F = temp_F + noise

    # Create input matrix X
    X = np.c_[temp_C, temp_F]

    # Compute OLS using lstsq
    X1 = np.c_[np.ones(X.shape[0]), X]  # Prepend the intercept column
    w, rss, rank, _ = lstsq(X1, users)  # OLS

    return w, rss, rank, X1

In [None]:
#Ridge  with Regularization from LinearRegression
from sklearn.linear_model import Ridge


def compute_with_regularization(temp_C, users, alpha=100, noise_scale=0.01, random_state=None):
    """Fit a Ridge regression on two nearly collinear temperature features.

    The Celsius temperatures are converted to Fahrenheit with a small Gaussian
    perturbation, and both columns are used as inputs to ``Ridge``.

    Parameters
    ----------
    temp_C : ndarray
        Temperatures in degrees Celsius; must support ``1.8 * temp_C + 32``
        and expose ``.shape``.
    users : array-like
        Target values, one per temperature.
    alpha : float, default 100
        Regularization strength passed to ``Ridge``.
    noise_scale : float, default 0.01
        Standard deviation of the Gaussian noise added to the Fahrenheit column.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for reproducible noise. With ``None`` the legacy
        global ``np.random`` state is used, so a caller's ``np.random.seed``
        still applies.

    Returns
    -------
    tuple
        ``(ridge, X)``: the fitted estimator and the two-column input matrix.
    """
    # Add small variations
    if random_state is None:
        noise = np.random.normal(loc=0, scale=noise_scale, size=temp_C.shape)
    else:
        rng = np.random.default_rng(random_state)
        noise = rng.normal(loc=0, scale=noise_scale, size=temp_C.shape)
    temp_F = (1.8 * temp_C + 32) + noise

    # Create input matrix X
    X = np.c_[temp_C, temp_F]

    # Fit a Ridge regression
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, users)

    return ridge, X


In [None]:
# Error surface, from gradientDescent
# Plot the error surface
def visualize_steps(fig, axis, log_a, log_b, x, y, grid_size=40):
    """Draw the RMSE surface of the model ``a * x + b`` and the logged (a, b) steps.

    Parameters
    ----------
    fig : matplotlib figure
        Figure that receives the colorbar.
    axis : matplotlib axes
        Axes on which the surface and the steps are drawn.
    log_a, log_b : sequence of float
        Logged values of the parameters ``a`` and ``b``; they are plotted as a
        path and also determine the extent of the grid.
    x, y : array-like
        Data used to score each (a, b) pair; ``a * x + b - y`` must be a valid
        array expression.
    grid_size : int, default 40
        Number of grid points along each parameter axis.
    """
    # Define a square grid of a,b parameters that covers every logged value
    min_ab = min(min(log_a), min(log_b))
    max_ab = max(max(log_a), max(log_b))

    # Pad the range by 10% on each side; fall back to a fixed pad when all the
    # logged values coincide, otherwise the grid would collapse to one point
    span = max_ab - min_ab
    pad = span * 0.1 if span > 0 else 1.0
    min_ab -= pad
    max_ab += pad

    a_values = np.linspace(min_ab, max_ab, num=grid_size)
    b_values = np.linspace(min_ab, max_ab, num=grid_size)
    a_grid, b_grid = np.meshgrid(a_values, b_values)

    # Compute the RMSE score for each a,b pair on that grid. The score is
    # computed inline so that this helper does not depend on an `rmse`
    # function, which is not defined alongside it.
    rmse_grid = np.zeros_like(a_grid)
    for idx in np.ndindex(a_grid.shape):
        residuals = a_grid[idx] * x + b_grid[idx] - y
        rmse_grid[idx] = np.sqrt(np.mean(np.square(residuals)))

    # RMSE surface (the colormap is passed by name, so no pyplot lookup is needed)
    axis.set_aspect("equal", adjustable="box")
    mpl_contourset = axis.contourf(a_grid, b_grid, rmse_grid, 20, cmap="coolwarm")
    fig.colorbar(mpl_contourset, ax=axis, label="RMSE")

    # Plot the GD steps
    axis.plot(log_a, log_b, c="#00abe9")
    axis.scatter(log_a, log_b, c="#00abe9")

    # Set titles and labels
    axis.set_xlabel("parameter a")
    axis.set_ylabel("parameter b")

    axis.set_xlim(min_ab, max_ab)
    axis.set_ylim(min_ab, max_ab)

In [None]:
#https://medium.com/nerd-for-tech/dealing-with-missing-data-using-python-3fd785b77a05
def plot_missingdata(df: pd.DataFrame, title: str, xlabel: str, ylabel: str) -> None:
    """Report and plot the number of missing values per column of ``df``.

    Prints the overall percentage of missing entries and the per-column counts
    for columns with at least one missing value, then draws those counts as a
    bar chart sorted in ascending order. When no column has missing values the
    function returns after printing, without drawing anything.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to inspect.
    title : str
        Title of the bar chart.
    xlabel, ylabel : str
        Axis labels of the bar chart.
    """
    # Total number of entries (rows x columns) in the dataset
    total = df.size
    # Number of missing values per column
    missing_count = df.isnull().sum()
    # Total number of missing values
    missing_tot = missing_count.sum()
    # Percentage of missing values; an empty frame has no entries to divide by
    missing_pct = (missing_tot / total) * 100 if total else 0.0
    print("The dataset contains", round(missing_pct, 2), "%", "missing values")

    # Keep only the columns that have at least one missing value
    missing = missing_count[missing_count > 0]
    print(missing)
    if missing.empty:
        # Nothing to draw: pandas cannot build a bar chart from an empty Series
        return

    # Sort ascending into a new Series instead of sorting the filtered slice in place
    missing = missing.sort_values()
    missing.plot.bar()
    plt.title(title, size=15, loc='left')
    plt.xticks(fontsize=11, rotation=45)
    plt.yticks(fontsize=11)
    plt.xlabel(xlabel, fontsize=13)
    plt.ylabel(ylabel, fontsize=13)
    plt.show()
    
import matplotlib.pyplot as plt
# Chart the per-column missing-value counts of the dataset
plot_missingdata(
    df,
    title='Dataset columns with missing values',
    xlabel='Column Name',
    ylabel='No. of Missing values',
)
