Model building

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Online News Popularity CSV (expected in the working directory)
DATA_PATH = 'OnlineNewsPopularity.csv'
df = pd.read_csv(DATA_PATH)

### 12.0: Data cleaning

In [2]:
# Trim leading/trailing whitespace from every column name
df.columns = [name.strip() for name in df.columns]

In [3]:
# Remove the 'url' and 'timedelta' columns before modelling
cols_to_drop = ['url', 'timedelta']
df = df.drop(columns=cols_to_drop)

### 12.3: Ridge and Lasso regression

In [4]:
# The target is the 'shares' column; every other column is used as a feature
y = df['shares']
X = df.drop(columns=['shares'])

In [5]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=79,
)

In [7]:
# Baseline: Ridge regression (alpha=0.1) fitted on the unscaled features
clf = Ridge(alpha=0.1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 3089.59


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [8]:
from sklearn.linear_model import Lasso

In [9]:
# Baseline: Lasso regression (alpha=0.1) fitted on the same unscaled split
clf2 = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred = clf2.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 3088.26


  model = cd_fast.enet_coordinate_descent(


Overall, even though Lasso regression had a slightly better result, the performance gain is marginal, so both models have similar performance.

### 12.4: Effects of scalers

StandardScaler scales the data so that each feature has a mean of 0 and a standard deviation of 1. It is a popular choice for normalizing the features in a dataset as it preserves the shape of the distribution. Because the mean and standard deviation are themselves affected by extreme values, it is less sensitive to outliers than MinMaxScaler but more sensitive than RobustScaler.
StandardScaler uses the following formula to transform each data point:
$$x_{scaled} = \frac{x - \mu}{\sigma}$$

MinMaxScaler scales the data to a fixed range of 0 to 1. It is particularly useful when you need to preserve the shape of the original distribution while ensuring that all values lie within a specific range.
$$x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$$

RobustScaler scales the data using the median and interquartile range (IQR) rather than the mean and standard deviation. This makes it a good choice when the data contains outliers or has a skewed distribution. Unlike MinMaxScaler, RobustScaler does not map the data to a fixed range: the median is mapped to 0 and the first and third quartiles end up exactly 1 apart, while outliers can still take large values. It is therefore a reasonable choice for models such as linear regression when outliers would otherwise dominate the scaling. The formula for RobustScaler is:
$$x_{scaled} = \frac{x - \mathrm{median}(x)}{\mathrm{IQR}(x)}$$

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [11]:
# Fit the standardisation on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Ridge (alpha=0.1) on the StandardScaler-transformed features
clf = Ridge(alpha=0.1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 3089.67


In [13]:
# Fit the min-max scaling on the training split only, then apply it to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Ridge (alpha=0.1) on the MinMaxScaler-transformed features
clf = Ridge(alpha=0.1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 3087.78


In [15]:
# Fit the median/IQR scaling on the training split only, then apply it to both splits
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Ridge (alpha=0.1) on the RobustScaler-transformed features
clf = Ridge(alpha=0.1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 3089.70


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


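As you can see, StandardScaler (MAE 3089.67) and RobustScaler (MAE 3089.70) performed almost the same (probably due to the almost normal distribution of the data), while MinMaxScaler gave a marginally lower error (MAE 3087.78). All three are within a few units of the unscaled Ridge result (3089.59), so the choice of scaler has very little effect here.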

### 12.5: Effect of polynomial features

In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Expand the full feature matrix with degree-2 polynomial terms
poly = PolynomialFeatures(degree=2)
Xp = poly.fit_transform(X)

In [19]:
# Re-split using the polynomial feature matrix (same test fraction and seed as before).
# NOTE: this rebinds X_train / X_test, so every later cell that uses them
# operates on the polynomial features, not the original columns.
X_train, X_test, y_train, y_test = train_test_split(
    Xp,
    y,
    test_size=0.2,
    random_state=79,
)

# Ridge (alpha=0.1) on the polynomial features
clf = Ridge(alpha=0.1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: %.2f' % mae)

Mean absolute error: 13065.25


The error increased after adding polynomial features (MAE 13065.25 versus 3089.59 with the original features), which suggests that the model overfit the training data.

### 12.6: Hyperparameter tuning

In [20]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [21]:
# Create a Ridge regression model
ridge = Ridge()

# Define the hyperparameters to tune
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Use GridSearchCV (5-fold cross-validation) to find the best alpha.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (R^2 for Ridge), not by the MAE reported at the end of this cell.
grid_search = GridSearchCV(ridge, params, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so reuse that model instead of fitting an identical one.
ridge = grid_search.best_estimator_

# Use the model to make predictions on the testing set
y_pred = ridge.predict(X_test)

# Print the mean absolute error (the metric computed is MAE, so label it as such)
print('Mean absolute error: %.2f' % mean_absolute_error(y_test, y_pred))

In [None]:
# Define the hyperparameters and their ranges to tune for Ridge regression.
# NOTE(review): `max_iter` is only used by Ridge's iterative solvers; with the
# default solver on dense data it appears to have no effect -- confirm whether
# a `solver` should be set (or `max_iter` dropped from the search space).
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
          'max_iter': [100, 1000, 10000]}

# Use RandomizedSearchCV to sample 10 of the 18 combinations (5-fold CV).
# A fresh Ridge() is used as the base estimator so this cell does not depend on
# a `ridge` object left over from an earlier cell; both searched parameters are
# overridden for every candidate anyway.
random_search = RandomizedSearchCV(Ridge(), params, cv=5, n_iter=10, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)

# The search refits the best configuration on the whole training set by default
# (refit=True), so reuse that model instead of fitting an identical one again.
ridge = random_search.best_estimator_

# Use the model to make predictions on the testing set
y_pred = ridge.predict(X_test)

# Print the mean absolute error
print('Mean absolute error: %.2f' % mean_absolute_error(y_test, y_pred))

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

{'max_iter': 1000, 'alpha': 100}


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Mean absolute error: 3825.88
