# Chapter 5 - Solving Regression Problems with Scikit Learn

## 5.1. Preparing Data for Regression Problems

In [2]:
# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns

#get dataset names from the seaborn library
sns.get_dataset_names()



  gh_list = BeautifulSoup(http)


['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [3]:
# importing the tips dataset
tips_df = sns.load_dataset("tips")

#printing dataset header
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# importing the tips dataset
diamond_df = sns.load_dataset("diamonds")

#printing dataset header
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### 5.1.1. Dividing Data into Features and Labels

In [5]:
#extracting features
X = tips_df.drop(['tip'], axis=1)

#extracting labels
y = tips_df["tip"]


In [6]:
#printing features
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [7]:
#priting labels header
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

### 5.1.2. Converting Categorical Data to Numbers



In [8]:
#removing categorical features
numerical = X.drop(['sex', 'smoker', 'day', 'time'], axis = 1)


In [9]:
#printint numeric features only
numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [11]:
#filtering categorical features
categorical = X.filter(['sex', 'smoker', 'day', 'time'])

#printing categorical features header
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [12]:
#performgin one hot encoding
import pandas as pd
cat_numerical = pd.get_dummies(categorical,drop_first=True)
cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [13]:
#concating numeric and one hot encoded features

X = pd.concat([numerical, cat_numerical], axis = 1)
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


### 5.1.3. Divide Data into Training and Test Sets

In [14]:
#dividing data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)

### 5.1.4. Data Scaling/Normalization

In [15]:
# feature scaling using standard scaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

## 5.2. Single Output Regression Problems

### 5.2.1. Linear Regression 

In [17]:
#importing linear regression model from sklearn
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

#training the model
regressor = lin_reg.fit(X_train, y_train)

#making predictions on the test set
y_pred = regressor.predict(X_test)

#evaluating model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.7080218832979829
Mean Squared Error: 0.893919522160961
Root Mean Squared Error: 0.9454731736865732


### 5.2.2. KNN Regression

In [18]:
#importing the KNN model from Sklearn
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_neighbors=5)

#training the model
regressor = knn_reg.fit(X_train, y_train)

#making predictions
y_pred = regressor.predict(X_test)

#evaluating model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.7513877551020406
Mean Squared Error: 0.9462902040816326
Root Mean Squared Error: 0.9727744877830794


### 5.2.3. Random Forest Regression 

In [16]:
#importing the random forest algorithm from Sklearn
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)

#training the model
regressor = rf_reg.fit(X_train, y_train)

#making predicitons on the test set
y_pred = regressor.predict(X_test)


#evaluating the model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.7045665306122448
Mean Squared Error: 0.8020627395265322
Root Mean Squared Error: 0.8955795551074913


### 5.2.4 K Fold Cross Validation

In [17]:
#importing cross validation model from the sklearn
from sklearn.model_selection import cross_val_score

#applying cross validation with 5 folds
print(cross_val_score(regressor, X, y, cv=5, scoring ="neg_mean_absolute_error"))

[-0.7875649  -0.59598449 -0.69376163 -0.99008286 -0.89050833]


### 5.2.5. Predicting a Single Value

In [18]:
#printing information about the 101th record in the dataset
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [23]:
# importing random forest regressor from sklearn
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)

#training the algorithm on training set
regressor = rf_reg.fit(X_train, y_train)

#making predictions on the 101th record from the dataset
single_record = sc.transform (X.values[100].reshape(1, -1))
predicted_tip = regressor.predict(single_record)

#printing the predicted value
print(predicted_tip)

[2.2609]


## 5.3. Multioutput Regression Problems

In [35]:
# example of multioutput regression problem
from sklearn.datasets import make_regression

# create dummy dataset
X, y = make_regression(n_samples=2000, n_features=8, n_informative=4, n_targets=3, random_state=42, noise=0.3)

# print dataset shape
print(X.shape, y.shape)



(2000, 8) (2000, 3)


In [36]:
#dividing data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)

# feature scaling using standard scaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

### 5.3.1 Linear Regression for Multiclass Output

In [24]:
#importing linear regression model from sklearn
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

#training the model
regressor = lin_reg.fit(X_train, y_train)

#making predictions on the test set
y_pred = regressor.predict(X_test)

#evaluating model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.24400622004095515
Mean Squared Error: 0.09288200051053495
Root Mean Squared Error: 0.3047654844475256


In [32]:
#making predictions on the 51st record from the test set
single_record = sc.transform (X_test[50].reshape(1, -1))
predicted_val = regressor.predict(single_record)

#printing the predicted value
print(predicted_val)

#printing the actual value
print(y_test[50])

[[ 52.14499321 154.07153888  29.65411176]]
[ 50.3331556  155.43458476  26.52621361]


### 5.3.2. Random Forest Regression for Multiclass Output

In [37]:
#importing the random forest algorithm from Sklearn
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)

#training the model
regressor = rf_reg.fit(X_train, y_train)

#making predicitons on the test set
y_pred = regressor.predict(X_test)


#evaluating the model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 17.578462377518566
Mean Squared Error: 737.569952450891
Root Mean Squared Error: 27.158239126476722


In [38]:
#making predictions on the 51st record from the test set
single_record = sc.transform (X_test[50].reshape(1, -1))
predicted_val = regressor.predict(single_record)

#printing the predicted value
print(predicted_val)

#printing the actual value
print(y_test[50])

[[ 15.29925902 114.41624666  12.90183432]]
[ 50.3331556  155.43458476  26.52621361]


### 5.3.3. Direct Multioutput Regression with Wrapper Algorithms

In [39]:
#importing MultiputputRegressor and LinearSVR from the Sklearn library
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR

svr_reg = LinearSVR()
# define the direct multioutput wrapper model
wrap_clf = MultiOutputRegressor(svr_reg)

#training the model
regressor = wrap_clf.fit(X_train, y_train)

#making predicitons on the test set
y_pred = regressor.predict(X_test)


#evaluating the model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.24566521365979566
Mean Squared Error: 0.09412825912574384
Root Mean Squared Error: 0.30680329060449113


In [40]:
#making predictions on the 51st record from the test set
single_record = sc.transform (X_test[50].reshape(1, -1))
predicted_val = regressor.predict(single_record)

#printing the predicted value
print(predicted_val)

#printing the actual value
print(y_test[50])

[[ 52.10616073 154.0113967   29.64235478]]
[ 50.3331556  155.43458476  26.52621361]


### 5.3.4. Chained Multioutput Regression with Wrapper Algorithms

In [43]:
#importing Multioutput Regressor and LinearSVR from the Sklearn library
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR

svr_reg = LinearSVR()
# define the direct multioutput wrapper model
wrap_clf = RegressorChain(svr_reg, order=[0,1,2])

#training the model
regressor = wrap_clf.fit(X_train, y_train)

#making predicitons on the test set
y_pred = regressor.predict(X_test)


#evaluating the model performance
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.2999883276581629
Mean Squared Error: 0.14873277575291557
Root Mean Squared Error: 0.3856588852249039




In [44]:
#making predictions on the 51st record from the test set
single_record = sc.transform (X_test[50].reshape(1, -1))
predicted_val = regressor.predict(single_record)

#printing the predicted value
print(predicted_val)

#printing the actual value
print(y_test[50])

[[ 52.11002869 154.00609972  29.12383881]]
[ 50.3331556  155.43458476  26.52621361]


## Exercise 5.1


### Question 1

Which of the following is an example of a regression output:

A- True \
B- Red \
C- 2.5 \
D- None of the above

Answer: C
    
    
### Question 2

Which of the following algorithm is a lazy algorithm?

A- Random Forest \
B- KNN \
C- SVM \
D- Linear Regression

Answer: B


### Question 3

Which of the following algorithm is not a regression metric?

A- Accuracy \
B- Recall \
C- F1 Measure \
D- All of the above

Answer: D

## Exercise 5.2

Using the `diamonds` dataset from seaborn library. Train a regression algorithm of your choice which predicts the price of the diamond. Perform all the preprocessing steps.

### Solution:

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

diamonds_df = sns.load_dataset("diamonds")

X = diamonds_df.drop(['price'], axis=1)
y = diamonds_df["price"]

numerical = X.drop(['cut', 'color', 'clarity'], axis = 1)

categorical = X.filter(['cut', 'color', 'clarity'])

cat_numerical = pd.get_dummies(categorical,drop_first=True)

X = pd.concat([numerical, cat_numerical], axis = 1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

from sklearn import svm
svm_reg = svm.SVR()
regressor = svm_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)



from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1719.4644618294901
Mean Squared Error: 10457583.658419697
Root Mean Squared Error: 3233.8187423570444
