### Logistic Regression

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Download the MNIST digits (784 pixel features per image) from OpenML.
# version=1 pins the dataset revision instead of relying on whichever one is
# currently marked active; as_frame=True guarantees pandas objects, which the
# later cells depend on when they call `.values`.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
x = mnist.data
y = mnist.target

Analyze the data structure

In [None]:
# Print the shape of the image matrix: one row per image, one column per pixel
# (each row is later reshaped to 28 by 28, i.e. a dimensionality of 784)
print("Image Data Shape" , mnist.data.shape)

# Print the shape of the label vector: one label per image (digit classes 0-9)
print("Label Data Shape", mnist.target.shape)

In [None]:
# Preview only the first rows; the full pixel table is far too large to be
# useful as cell output.
x.head()

Split train and test data

In [None]:
# Hold out one seventh of the samples for testing; the fixed random_state
# keeps the split identical between runs.
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data,
    mnist.target,
    test_size=1/7.0,
    random_state=0,
)

# Peek at the first five training labels.
train_lbl.values[:5]

In [None]:
# Shape of the held-out test split as (rows, columns): one row per test image.
test_img.shape

Plot some of the training samples

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Show the first five training digits side by side, each titled with its label.
preview_imgs = train_img.values[:5]
preview_lbls = train_lbl.values[:5]

plt.figure(figsize=(20, 4))
for position in range(min(len(preview_imgs), len(preview_lbls))):
    plt.subplot(1, 5, position + 1)
    # Every row is a flattened image; restore the 28x28 pixel grid for display.
    plt.imshow(preview_imgs[position].reshape(28, 28), cmap=plt.cm.gray)
    plt.title('Training: %i\n' % int(preview_lbls[position]), fontsize=20)

Import LogisticRegression module from sklearn, create a model and use the training data to fit the model

In [None]:
from sklearn.linear_model import LogisticRegression

# The 'saga' solver shuffles the data while optimising, so random_state is
# fixed to make the fitted coefficients (and every score reported below)
# reproducible across runs. tol and max_iter are kept as originally chosen.
logisticRegr = LogisticRegression(solver='saga', tol=1e-2, max_iter=1000, random_state=0)

# Fit on plain NumPy arrays, so the model is trained without feature names.
logisticRegr.fit(train_img.values, train_lbl.values)

 We can now predict the outcome for one...

In [None]:
# predict() works on a 2-D array, so the single image is reshaped into one row;
# the result comes back as a NumPy array holding one label.
first_sample = train_img.values[0].reshape(1, -1)
first_prediction = logisticRegr.predict(first_sample)
print(f"actual: {train_lbl.values[0]}   |   prediction: {first_prediction}")

or multiple observations from the training dataset:

In [None]:
# Compare the true labels of the first ten training images with the labels
# the model predicts for the same images.
first_ten = slice(0, 10)
print(f"actual values: {train_lbl.values[first_ten]}")
print(f"predicted values: {logisticRegr.predict(train_img.values[first_ten])}")

What we are really after, though, is the accuracy of our trained classifier on previously unseen data. That is what we created our test dataset for:

In [None]:
# Evaluate on the held-out data: keep the per-image predictions for the
# misclassification analysis below and report the mean accuracy.
test_pixels = test_img.values
test_targets = test_lbl.values

predictions = logisticRegr.predict(test_pixels)
score = logisticRegr.score(test_pixels, test_targets)
print(f"accuracy: {score}")

Here we predict the probability of each class for every image in the test set and inspect the result for a single image (the first one):

In [None]:
# Pass the raw NumPy array, matching how the model was fitted and how every
# other cell calls it; handing over the DataFrame makes scikit-learn warn
# that the input has feature names the model was not fitted with.
prediction_proba = logisticRegr.predict_proba(test_img.values)

# Row 0 holds the class probabilities of the first test image.
print(f"prediction probabilities: {prediction_proba[0]}")
# The probabilities of one image form a distribution, so they sum to ~1.
print(f"sum over prediction probabilities: {prediction_proba[0].sum()}")

In order to display misclassified results, we first need to know which images are actually wrongly classified.

In [None]:
# Positions within the test set where the predicted label differs from the
# true label. enumerate() replaces the hand-maintained counter, so no stray
# `index` variable is left behind in the notebook namespace.
misclassifiedIndexes = [
    position
    for position, (label, predict) in enumerate(zip(test_lbl.values, predictions))
    if label != predict
]

Plot the first five of the previously identified misclassified images

In [None]:
# Draw up to five misclassified test digits, titled with predicted vs. true label.
plt.figure(figsize=(20, 4))
for panel, badIndex in enumerate(misclassifiedIndexes[:5], start=1):
    plt.subplot(1, 5, panel)
    wrong_image = test_img.values[badIndex].reshape(28, 28)
    plt.imshow(wrong_image, cmap=plt.cm.gray)
    caption = 'Predicted: {}, Actual: {}'.format(predictions[badIndex], test_lbl.values[badIndex])
    plt.title(caption, fontsize=15)

In [None]:
# Class probabilities the model assigned to the fifth misclassified test image.
fifth_bad = misclassifiedIndexes[4]
prediction_proba[fifth_bad]

### Polynomial regression
Why and when should we use Polynomial Regression instead of Linear Regression?

"There are 3 main situations that would warrant a Polynomial Regression over Linear:

1. The theoretical reason. The researcher (you) may hypothesise that the data will be curvilinear, in which case you should obviously fit it with a curve.
2. Upon a visual inspection of your data, a curvilinear relationship may be revealed. This could be achieved by a simple scatter plot (which is why you should always perform univariate and bivariate inspections of your data before applying a Regression Analysis).
3. Inspecting the model’s residuals. Attempting to fit a linear model to curvilinear data will result in high positive and negative residuals, and a low R² score."
[[1]](https://towardsdatascience.com/polynomial-regression-the-only-introduction-youll-need-49a6fb2b86de)

Create data and split it into train and test dataset:

In [None]:
# Toy data set: 30 consecutive integer x positions (0..29) and one observed
# y value for each of them.
x = np.arange(30)
y = [
    3, 4, 5, 7, 10, 8, 9, 10, 10, 23,
    27, 44, 50, 63, 67, 60, 62, 70, 75, 88,
    81, 87, 95, 100, 108, 135, 151, 160, 169, 179,
]

# Hold out 25% of the points for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=21)

Plot the data:

In [None]:
# Scatter plot of the complete data set, before looking at the train/test split.
plt.scatter(x, y, c='#8acfd4', label='All data')
plt.legend(loc="upper left")
plt.show()

Plot train and test data

In [None]:
# Same scatter plot, with training and testing points in different colours
# to show how the split is distributed along x.
plt.scatter(x_train, y_train, c='#8acfd4', label='Training data')
plt.scatter(x_test, y_test, c='#edbf6f', label='Testing data')
plt.legend(loc="upper left")
plt.show()

#### Generate polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

Reshape the data into column vectors and sort each split by x

In [None]:
# Turn every split into a column vector of shape (n_samples, 1).
x_train, x_test, y_train, y_test = (
    np.array(part).reshape(-1, 1) for part in (x_train, x_test, y_train, y_test)
)

# Sort each split by ascending x and apply the same permutation to its
# targets, so x/y pairs stay aligned and line plots run left to right.
train_order = x_train[:, 0].argsort()
test_order = x_test[:, 0].argsort()
x_train, y_train = x_train[train_order], y_train[train_order]
x_test, y_test = x_test[test_order], y_test[test_order]

# two possibilities to reach the same goal:

print("x_train.shape: {}  y_train.shape: {}".format(x_train.shape,y_train.shape))
print("x_train.shape: ({}, {})  y_train.shape: ({}, {})".format(len(x_train),len(x_train[0]),len(y_train),len(y_train[0])))

print("x_test.shape: {}  y_test.shape: {}".format(x_test.shape,y_test.shape))
print("x_test.shape: ({}, {})  y_test.shape: ({}, {})".format(len(x_test),len(x_test[0]),len(y_test),len(y_test[0])))

Create polynomial features objects and transform data

In [None]:
# One feature expander per polynomial degree (2, 5 and 10).
poly, poly5, poly10 = (PolynomialFeatures(degree=d) for d in (2, 5, 10))

# Expand the training x values into the polynomial feature matrices.
# Only the training split is transformed here; the test split is left as is.
x_train_poly = poly.fit_transform(x_train)
x_train_poly5 = poly5.fit_transform(x_train)
x_train_poly10 = poly10.fit_transform(x_train)

Create models

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so each model is created and trained in
# a single expression. A "polynomial regression" here is an ordinary linear
# regression on the expanded polynomial features.
poly_reg = LinearRegression().fit(x_train_poly, y_train)
poly5_reg = LinearRegression().fit(x_train_poly5, y_train)
poly10_reg = LinearRegression().fit(x_train_poly10, y_train)

# Straight-line baseline fitted on the untransformed x values.
lin_reg = LinearRegression().fit(x_train, y_train)
lin_reg

Plot models

In [None]:
# Overlay the four fitted curves on the data. Predictions are drawn at the
# (sorted) training x positions, so every curve is a polyline through them.
plt.title('Polynomial regression example')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(x_train.reshape(-1,), y_train.reshape(-1,), c='#8acfd4', label='Training data')
plt.scatter(x_test.reshape(-1,), y_test.reshape(-1,), c='#edbf6f', label='Testing data')
# Legend label typo fixed: "degress" -> "degrees".
plt.plot(x_train, poly_reg.predict(x_train_poly), c='#a3cfa3', label='Polynomial regression line (2 degrees)')
plt.plot(x_train, poly5_reg.predict(x_train_poly5), c='#0FF0FF', label='Polynomial regression line (5 degrees)')
plt.plot(x_train, poly10_reg.predict(x_train_poly10), c='r', label='Polynomial regression line (10 degrees)')
plt.plot(x_train, lin_reg.predict(x_train), c='#ffcff3', label='Linear regression line')
plt.legend(loc="upper left")
plt.show()

#### numpy.polyfit

In [None]:
# Re-create the toy data set for the numpy.polyfit section: x runs over the
# integers 0..29 and y holds one observation per x position.
x = np.arange(30)
y = [
    3, 4, 5, 7, 10, 8, 9, 10, 10, 23,
    27, 44, 50, 63, 67, 60, 62, 70, 75, 88,
    81, 87, 95, 100, 108, 135, 151, 160, 169, 179,
]

In [None]:
# Split with the same test_size and random_state as in the PolynomialFeatures
# section, this time keeping the four arrays one-dimensional.
x_train, x_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(x, y, test_size=0.25, random_state=21)
)

# Sort each split by ascending x, permuting the targets identically.
train_order = x_train.argsort()
test_order = x_test.argsort()
x_train, y_train = x_train[train_order], y_train[train_order]
x_test, y_test = x_test[test_order], y_test[test_order]

# Column-vector (n_samples, 1) views of the same data.
x_train_resh, x_test_resh, y_train_resh, y_test_resh = (
    part.reshape(-1, 1) for part in (x_train, x_test, y_train, y_test)
)

plt.title('Polynomial regression example')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(x_train, y_train, c='#8acfd4', label='Training data')
plt.scatter(x_test, y_test, c='#edbf6f', label='Testing data')
plt.show()

In [None]:
poly = PolynomialFeatures(degree=2)

# Build the degree-10 feature matrix from the *current* training split rather
# than reusing the x_train_poly10 left over from the PolynomialFeatures
# section, so this cell no longer depends on stale state from earlier cells.
poly10 = PolynomialFeatures(degree=10)
x_train_poly10 = poly10.fit_transform(x_train.reshape(-1, 1))

poly10_reg = LinearRegression()
poly10_reg.fit(x_train_poly10, y_train)

# np.polyfit returns the fitted polynomial coefficients (highest power first):
# degree 10 on the training split, degree 3 on the test split.
x_train_poly_np = np.polyfit(x_train,y_train, 10)
x_test_poly_np = np.polyfit(x_test,y_test, 3)

# Wrap the training-split coefficients into a callable degree-10 polynomial.
poly2d_fn = np.poly1d(x_train_poly_np)

# Compare both degree-10 fits: the np.polyfit polynomial is evaluated on every
# x, the scikit-learn model only at the training x positions.
plt.title('Polynomial regression example')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(x_train, y_train, c='#8acfd4', label='Training data')
plt.scatter(x_test, y_test, c='#edbf6f', label='Testing data')
plt.plot(x,poly2d_fn(x), label='Polynomial regression line (10 degrees) (np.polyfit)')
plt.plot(x_train, poly10_reg.predict(x_train_poly10), c='r', label='Polynomial regression line (10 degrees) (PolynomialFeatures)')
plt.legend(loc="best")
plt.show()

In [None]:
# Show only the degree-2 and degree-5 fits on top of the data; the degree-10
# and straight-line fits are deliberately left out of this figure.
plt.title('Polynomial regression example')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(x_train, y_train, c='#8acfd4', label='Training data')
plt.scatter(x_test, y_test, c='#edbf6f', label='Testing data')
# Legend label typo fixed: "degress" -> "degrees".
plt.plot(x_train, poly_reg.predict(x_train_poly), c='#a3cfa3', label='Polynomial regression line (2 degrees)')
plt.plot(x_train, poly5_reg.predict(x_train_poly5), c='#0FF0FF', label='Polynomial regression line (5 degrees)')
plt.legend(loc="upper left")
plt.show()