# Simple Linear Regression vs. Polynomial Regression

## Data Pre-processing

In [None]:
#1. Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#2. Load the dataset and show its first rows
df = pd.read_csv('Data/Position_Salaries.csv')
print("The dataset preview:\n")
print(df.head())

#3. Missing data
# 3.1: per-column dtypes and non-null counts
df.info()
# 3.2: number of null values in each column
missing_values = df.isna().sum()
print(missing_values)

#4. Keep only the numeric feature ("Level") and the target ("Salary")
df_encoded = df.loc[:, ["Level", "Salary"]]

#5. X and y values
# X: every column except the last, kept 2-D; y: the last column, 1-D.
feature_frame = df_encoded.iloc[:, :-1]
target_series = df_encoded.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

## Training the Simple Linear Regression model on the whole dataset

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line of y on X using every row of the dataset
# (no train/test split is made in this notebook).
regressor = LinearRegression()
regressor.fit(X, y)

## Predicting on the training data

In [None]:
# Print NumPy floating-point values with 2 digits of precision from here on.
np.set_printoptions(precision=2)

In [None]:
# In-sample prediction: evaluate the linear model on the same X it was fitted on.
y_pred_linear = regressor.predict(X)

In [None]:
# Show the fitted values as the cell output.
y_pred_linear

## Visualization

In [None]:
# Scatter the observed points and overlay the fitted straight line.
fig, ax = plt.subplots(figsize=(4, 3))
x_label, y_label = df_encoded.columns[:2]
ax.scatter(X, y, color='blue')
ax.plot(X, y_pred_linear, color='red')
ax.set_title('Simple Linear Regression')
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
plt.show()

## Polynomial Regression

In [None]:
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures

# Polynomial Regression (degree=2)
# `degree` is the highest power generated. For the single feature x the
# transformer produces the columns [1, x, x^2] (the leading 1 is the bias term).
poly_2 = PolynomialFeatures(degree=2)
poly_2.fit(X)
X_poly_2 = poly_2.transform(X)

# A plain linear model on the expanded columns is a quadratic in x.
lin_for_ploy = LinearRegression()
lin_for_ploy.fit(X_poly_2, y)

In [None]:
# In-sample prediction: evaluate the degree-2 model on the expanded
# features it was fitted on.
y_pred_poly_2 = lin_for_ploy.predict(X_poly_2)

In [None]:
# Plot the observations together with the degree-2 fit.
fig, ax = plt.subplots()
x_label, y_label = df_encoded.columns[:2]
ax.scatter(X, y, color='blue')
ax.plot(X, y_pred_poly_2, color='red')
ax.set_title('Polynomial Regression (n=2)')
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
plt.show()

In [None]:
# Polynomial Regression (degree=3)
# A dedicated estimator is used for this degree. Re-fitting the shared
# `lin_for_ploy` here would overwrite the degree-2 model, so re-running the
# degree-2 prediction cell afterwards would fail on a feature-count mismatch.
poly_3 = PolynomialFeatures(degree=3)
X_poly_3 = poly_3.fit_transform(X)              # columns [1, x, x^2, x^3]
lin_poly_3 = LinearRegression()
lin_poly_3.fit(X_poly_3, y)
y_pred_poly_3 = lin_poly_3.predict(X_poly_3)    # in-sample fitted values

In [None]:
# Polynomial Regression (degree=4)
# Expand the feature to [1, x, x^2, x^3, x^4], then re-fit the shared
# regressor on it; from here on `lin_for_ploy` is the degree-4 model.
poly_4 = PolynomialFeatures(degree=4)
poly_4.fit(X)
X_poly_4 = poly_4.transform(X)
y_pred_poly_4 = lin_for_ploy.fit(X_poly_4, y).predict(X_poly_4)

In [None]:
# Compare the in-sample fits of all three polynomial degrees on one figure,
# so the plot matches its title and the legend has one entry per curve.
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred_poly_2, color='red', label='Poly n=2')
plt.plot(X, y_pred_poly_3, color='green', label='Poly n=3')
plt.plot(X, y_pred_poly_4, color='brown', label='Poly n=4')
plt.title('Polynomial Regression (n=2,3,4)')
plt.xlabel(df_encoded.columns[0])
plt.ylabel(df_encoded.columns[1])
plt.legend()    # Show labels
plt.show()

### How to make the curve smoother?

In [None]:
# Evaluate the degree-4 model on a fine grid (step 0.1) instead of only at the
# observed levels, so the curve is drawn smoothly.
x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
# linspace includes both end points; np.arange(lo, hi, 0.1) would stop one
# step short of the largest level.
n_points = int(round((x_hi - x_lo) / 0.1)) + 1
X_grid = np.linspace(x_lo, x_hi, n_points).reshape(-1, 1)   # the model expects a 2-D (n_samples, n_features) array

plt.scatter(X, y, color='blue')
# `poly_4` is already fitted, so the grid is only transformed, not re-fitted.
# `lin_for_ploy` was last fitted on the degree-4 features.
plt.plot(X_grid, lin_for_ploy.predict(poly_4.transform(X_grid)), color='brown', label='Poly n=4')
plt.title('Polynomial Regression (n=4)')
plt.xlabel(df_encoded.columns[0])
plt.ylabel(df_encoded.columns[1])
plt.legend()    # Show labels
plt.show()

### Predict a single point

In [55]:
# Linear model: prediction for a single input value of 6.5.
# The input is 2-D: one row, one feature.
regressor.predict([[6.5]])

array([335474.56])

In [58]:
# Polynomial (degree 4): prediction for a single input value of 6.5.
# Re-fit on the degree-4 features so this cell does not depend on which
# degree `lin_for_ploy` was last fitted with.
lin_for_ploy.fit(X_poly_4, y)
# `poly_4` is already fitted on X, so the query point is only transformed;
# a transformer should not be re-fitted on the data being predicted.
lin_for_ploy.predict(poly_4.transform([[6.5]]))

array([158862.45])