# Feature engineering

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


## Create a random dataset

In [None]:
# Fix the seed so the same samples are drawn on every run.
np.random.seed(3)
num_samples = 20
# Draw the inputs at random from a fine grid over [0, 1000], as a column vector.
X = np.random.choice(np.linspace(0, 1000, 10000), num_samples).reshape(-1, 1)
# Exponential trend with a constant offset, plus Gaussian noise (std 15).
y = 20 + 2**(0.008 * X)
y += np.random.normal(scale=15, size=y.shape)
plt.scatter(X, y)
# Render the figure explicitly, as the later plotting cells do, so the cell
# does not end on the artist object returned by scatter.
plt.show()


## Regressor

In [None]:
# Fit an ordinary least-squares line directly to the raw (X, y) samples.
rg = LinearRegression().fit(X, y)

# Evenly spaced evaluation grid over [0, 1000], shaped as a single column.
x_ls = np.linspace(0, 1000, 1000)[:, np.newaxis]
y_pred = rg.predict(x_ls)

# Training points in blue, fitted line in red.
plt.scatter(X, y, color="blue")
plt.plot(x_ls, y_pred, color="red")
plt.show()


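A plain linear regressor clearly doesn't fit this kind of data: the target grows exponentially with X, so a straight line cannot follow it. Because the trend is roughly exponential, we can instead fit the regressor to log(y) and map its predictions back to the original scale with exp.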

In [None]:
# Linearize the exponential trend: fit the regressor on log(y) and map its
# predictions back to the original scale with exp.
# The Gaussian noise added to y can push small targets to zero or below, where
# the logarithm is NaN/-inf and the fit would fail, so only strictly positive
# targets are used for fitting. When every target is positive this selects all
# samples and the fit is the same as fitting on log(y) directly.
positive_mask = (y > 0).ravel()
rg = LinearRegression()
rg.fit(X[positive_mask], np.log(y[positive_mask]))

x_ls = np.linspace(0, 1000, 1000).reshape(-1, 1)
y_pred = rg.predict(x_ls)

# All samples are still shown; the curve is the back-transformed prediction.
plt.scatter(X, y, color="blue")
plt.plot(x_ls, np.exp(y_pred), color="red")
plt.show()


Polynomial features: create new features from polynomial combinations of the original features. E.g. if $a$ and $b$ are the inputs, the polynomial features of degree 2 are $[1, a, b, a^2, ab, b^2]$ (the constant term $1$ is omitted when `include_bias=False`).

In [None]:

# Expand the single input into its powers up to degree 32 (no constant column)
# and fit an ordinary least-squares model on the expanded features.
# NOTE(review): 32 powers of an unscaled input that reaches 1000, fitted to
# only 20 samples - presumably a deliberate over-fitting demo; confirm.
polynomial_features = PolynomialFeatures(include_bias=False, degree=32)
linear_regression = LinearRegression()

pipeline = Pipeline(
    steps=[
        ("polynomial_features", polynomial_features),
        ("linear_regression", linear_regression),
    ]
).fit(X, y)

# Evaluate on a grid restricted to [150, 990] rather than the full [0, 1000].
x_ls = np.linspace(150, 990, 1000)[:, np.newaxis]
y_pred = pipeline.predict(x_ls)

# Training points in blue, fitted polynomial in red.
plt.scatter(X, y, color="blue")
plt.plot(x_ls, y_pred, color="red")
plt.show()
