# Linear Regression in Python


The task in this section is to construct a linear regression model to predict children's height using their parents' heights, their gender, and the number of children in the family using a modified version of the famous [Galton Height data set](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1).

The modified version of the data set can be found [here](https://drive.google.com/file/d/1vXf3NHcTL7tJE1vFX3enuKTJHeVCnvPy/view?usp=share_link), and a description of the data can be found [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1). Add the file
to the folder you are using.


In [None]:
# import libraries
# Import standard packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import modeling tools
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Make scikit-learn transformers return pandas DataFrames instead of NumPy
# arrays, so the transformed data keeps its column names and row index.
from sklearn import set_config
set_config(transform_output='pandas')

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the modified Galton height data
# NOTE(review): this path points at /content, not at the /content/drive mount
# point — confirm the CSV was uploaded to the Colab session storage.
fpath = "/content/galton-height-raw.csv"
df = pd.read_csv(fpath)
df.info()  # column dtypes and non-null counts
df.head()  # preview the first rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   family  898 non-null    object 
 1   father  898 non-null    float64
 2   mother  898 non-null    float64
 3   gender  898 non-null    object 
 4   height  898 non-null    float64
 5   kids    898 non-null    int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 42.2+ KB


Unnamed: 0,family,father,mother,gender,height,kids
0,1,78.5,67.0,M,73.2,4
1,1,78.5,67.0,F,69.2,4
2,1,78.5,67.0,F,69.0,4
3,1,78.5,67.0,F,69.0,4
4,2,75.5,66.5,M,73.5,4


### Explore the Data

In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
family,0
father,0
mother,0
gender,0
height,0
kids,0


In [None]:
# List the distinct family identifiers (stored as strings)
df['family'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46',
       '47', '48', '49', '51', '52', '53', '54', '55', '56', '57', '58',
       '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69',
       '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80',
       '81', '82', '83', '85', '86', '87', '88', '89', '90', '91', '92',
       '93', '94', '95', '96', '97', '98', '99', '100', '101', '102',
       '103', '104', '105', '106', '107', '108', '109', '110', '112',
       '113', '114', '115', '116', '117', '118', '119', '121', '122',
       '123', '124', '125', '126', '127', '128', '129', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '140',
       '141', '142', '143', '144', '145', '146', '147', '148', '149',

In [None]:
# List the distinct gender categories
df['gender'].unique()

array(['M', 'F'], dtype=object)

In [None]:
# Count the unique categories in each object (string) column
df.select_dtypes('object').nunique()

Unnamed: 0,0
family,197
gender,2


There are two categorical features, but family has high cardinality; therefore, we will drop it.

In [None]:
# Remove the family column: its high cardinality (197 categories) would
# encourage overfitting once one-hot encoded.
df = df.drop(columns=['family'])
df.head()

Unnamed: 0,father,mother,gender,height,kids
0,78.5,67.0,M,73.2,4
1,78.5,67.0,F,69.2,4
2,78.5,67.0,F,69.0,4
3,78.5,67.0,F,69.0,4
4,75.5,66.5,M,73.5,4


In [None]:
# Check for inconsistencies

# Inconsistent category labels (e.g. different spellings of the same gender)
# would show up as extra rows in this frequency table
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
M,465
F,433


In [None]:
# Summary statistics of the numeric features; implausible min/max values
# would indicate inconsistent entries
df.describe().round(2)

Unnamed: 0,father,mother,height,kids
count,898.0,898.0,898.0,898.0
mean,69.23,64.08,66.76,6.14
std,2.47,2.31,3.58,2.69
min,62.0,58.0,56.0,1.0
25%,68.0,63.0,64.0,4.0
50%,69.0,64.0,66.5,6.0
75%,71.0,65.5,69.7,8.0
max,78.5,70.5,79.0,15.0


### Data Preparation

In [None]:
# Arrange the data into the target vector (y) and the feature matrix (X):
# height is what we predict, every remaining column is a feature.
y = df['height']
X = df.drop(columns=['height'])

In [None]:
# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # default test size is 0.25
X_train.head()
# Sanity check to consider: len(X_train) should equal len(y_train), and
# len(X_test) should equal len(y_test)

Unnamed: 0,father,mother,gender,kids
377,70.5,62.0,F,8
357,70.5,63.0,F,5
723,67.0,64.0,M,4
306,70.0,64.7,F,7
464,69.0,66.0,F,9


#### Preprocessing
You've learned many ways to preprocess data. In this case, all columns except Gender are numeric. There is no missing data.

The only preprocessing we need to do is:

- scaling the numeric features
- one-hot-encoding the categorical feature.

You can pass any preprocessing transformer, such as a OneHotEncoder or a StandardScaler, directly to your column transformer without needing an individual pipeline for each datatype.

**Create the Preprocessor ColumnTransformer**

In [None]:
# Scaler that will be applied to every numeric feature
scaler = StandardScaler()

# Names of the numeric columns in the training features
num_cols = X_train.select_dtypes(include='number').columns

# (name, transformer, columns) tuple consumed by the ColumnTransformer
num_tuple = ('numeric', scaler, num_cols)
num_tuple

('numeric',
 StandardScaler(),
 Index(['father', 'mother', 'kids'], dtype='object'))

In [None]:
# Get list of categorical (object dtype) columns
cat_cols = X_train.select_dtypes('object').columns

# One-hot encoder: categories not seen during fit are ignored instead of
# raising (handle_unknown='ignore'), and the output is dense (sparse_output=False)
encoder_gen = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# (name, transformer, columns) tuple consumed by the ColumnTransformer
cat_tuple = ('categorical', encoder_gen, cat_cols)
cat_tuple

('categorical',
 OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 Index(['gender'], dtype='object'))

In [None]:
# Combine the numeric and categorical tuples into a single ColumnTransformer.
# verbose_feature_names_out=False keeps the output column names free of the
# 'numeric'/'categorical' transformer-name prefixes.
preprocessor = ColumnTransformer(
    transformers=[num_tuple, cat_tuple],
    verbose_feature_names_out=False,
)
preprocessor

In [None]:
# Data Transformation
# Fit the preprocessor once, on the training data only, so that no statistics
# from the test set leak into the scaler or the encoder.
preprocessor.fit(X_train)

# Transform the training and test data with the fitted preprocessor
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)


In [None]:
# Preview the first two raw test rows. Only the last expression in a cell is
# displayed, so a preceding X_train.head(2) had no visible effect.
X_test.head(2)

Unnamed: 0,father,mother,gender,kids
331,70.5,64.5,F,1
638,68.0,63.0,M,8


In [None]:
# Preview the first two test rows after scaling and one-hot encoding. Only the
# last expression in a cell is displayed, so a preceding X_train_tf.head(2)
# had no visible effect.
X_test_tf.head(2)

Unnamed: 0,father,mother,kids,gender_F,gender_M
331,0.513292,0.173627,-1.904297,1.0,0.0
638,-0.483608,-0.458911,0.706629,0.0,1.0


**Now you are ready to run your first model!**   

**Step 1: Import and instantiate the model.**  

Define the model with an appropriate name. Here we chose lin_reg. We instantiated the model with no arguments, so we will use the default model.

In [None]:
# Import the linear regression estimator
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Instantiate the Linear Regression model with its default settings
lin_reg = LinearRegression()


**Step 2: Train the model on your training data.**  

This is the step where the model "learns" about the relationship between the features (X) and the target (y).

In [None]:
# Fit the model on the preprocessed training features and the training target
lin_reg.fit(X_train_tf, y_train)

**Understanding the Fit Step (Optional)**

​This is not required for the model, but it will help you understand what is happening during this step.  

​We can view the results of the fit step. First, let's obtain the y-intercept

$y = mx + c$, where $m$ is the gradient (slope) and $c$ is the intercept.

In [None]:
# View the intercept (the constant term c) determined during the fit step
lin_reg.intercept_

np.float64(66.68792697250986)

Now let's obtain the coefficients. There will be one coefficient for each feature.

With one variable (feature):

$y = mx + c$

With more than one variable (feature):

$y = m_1x_1 + m_2x_2 + m_3x_3 + \dots + m_nx_n + c$

In [None]:
# First two training rows before preprocessing
X_train.head(2)

Unnamed: 0,father,mother,gender,kids
377,70.5,62.0,F,8
357,70.5,63.0,F,5


In [None]:
# First two training rows after scaling and one-hot encoding
X_train_tf.head(2)

Unnamed: 0,father,mother,kids,gender_F,gender_M
377,0.513292,-0.880603,0.706629,1.0,0.0
357,0.513292,-0.458911,-0.412339,1.0,0.0


In [None]:
# View the coefficients determined during the fit step
# y = m1*x1 + m2*x2 + ... + mn*xn + c; coef_ holds m1..mn, one per feature
lin_reg.coef_

array([ 1.05705547,  0.68828982, -0.09504525, -2.63426472,  2.63426472])

We can match these coefficients with the feature names that were used in the processed dataframe. Let's extract those from our preprocessor.

In [None]:
# Get the feature names after preprocessing; their order matches the columns
# of the transformed data
feature_names= preprocessor.get_feature_names_out()
feature_names

array(['father', 'mother', 'kids', 'gender_F', 'gender_M'], dtype=object)



Now we can create a Pandas Series of our coefficients using the feature names as the index.

In [None]:
# Label each fitted coefficient with the name of the feature it multiplies
coeffs = pd.Series(data=lin_reg.coef_, index=feature_names)
coeffs

Unnamed: 0,0
father,1.057055
mother,0.68829
kids,-0.095045
gender_F,-2.634265
gender_M,2.634265


The resulting equation is:
<img src="https://assets.codingdojo.com/boomyeah2015/codingdojo/curriculum/content/chapter/1685041059__1685038032regeqnbetas.png">




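y_pred = 66.69 + 1.06(father) + 0.69(mother) - 0.10(kids) - 2.63(gender_F) + 2.63(gender_M)

Here 66.69 is the intercept, and each feature value is the scaled or one-hot-encoded value from the processed dataframe.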

We can fill in the values from our first row of test data to see what the prediction will be based on the equation:

In [None]:
# View the first two rows of features in the test set
X_test_tf.head(2)

Unnamed: 0,father,mother,kids,gender_F,gender_M
331,0.513292,0.173627,-1.904297,1.0,0.0
638,-0.483608,-0.458911,0.706629,0.0,1.0


Now, we can fill these values into our equation where y_pred is the predicted height.

y_pred = 66.69 + 1.06(0.513) + 0.69(0.174) - 0.10(-1.904) - 2.63(1) + 2.63(0) ≈ 64.9

The predicted height = 64.9

Now that the model has "learned" the equation, let's use it to make predictions.

**Step 3: Use the model to make predictions for training and testing data**

In [None]:
# Predicted heights for the rows the model was trained on
y_predictions_train = lin_reg.predict(X=X_train_tf)
# Predicted heights for the held-out test rows
y_predictions_test = lin_reg.predict(X=X_test_tf)

In [None]:
# Display the test-set predictions (the model is already fitted, so it is
# not refitted here)
lin_reg.predict(X_test_tf)

array([64.89674079, 68.42796593, 69.77199091, 67.52457965, 65.3982589 ,
       69.77199091, 63.9229688 , 64.43450926, 69.50860661, 65.43651339,
       65.43651339, 64.91196894, 64.20563184, 64.16917541, 64.41623607,
       68.58609309, 61.79908786, 68.53099892, 69.21921433, 70.10542292,
       62.41749701, 64.66771188, 64.67544667, 64.19689149, 68.15157764,
       64.25725486, 66.88162181, 67.81482598, 65.73449451, 63.6571189 ,
       67.37613683, 65.36561163, 70.02873592, 64.66771188, 69.83430382,
       68.42796593, 72.47916812, 64.38851998, 70.53832683, 67.67411796,
       69.93011807, 63.37878106, 68.67612209, 62.76478706, 69.41279237,
       65.46142587, 72.1505478 , 64.20563184, 64.24007716, 67.9538259 ,
       64.66771188, 69.99429059, 64.28938609, 66.17580151, 64.68930471,
       63.95068488, 66.88201835, 69.7998585 , 64.24007716, 71.7286398 ,
       65.10640103, 63.19769097, 69.7998585 , 64.7142172 , 69.65704943,
       67.70847334, 66.88201835, 64.58295207, 68.23353384, 69.21

In [None]:
# Number of test predictions; should match the number of true test targets
y_predictions_test.shape

(225,)

In [None]:
# Number of true test targets
y_test.shape

(225,)

**Step 4: Evaluate the Results**

Note: We will learn evaluation strategies in the next lesson (we have covered these in the slides). For now, we will conceptualize the model's performance.

This is not required, but let's see how our predictions compare to the true values. We will use the testing data. In addition to the features, our data frame will show the True Height, the Predicted Height, and the difference between the two (this is the error in the prediction)

In [None]:
# Save a copy of X_test_tf and add the true height, the predicted height,
# and the prediction error
prediction_df = X_test_tf.copy()
prediction_df['True Height'] = y_test
prediction_df['Predicted Height'] = y_predictions_test.round(1)
# Error = predicted - true, so a positive value means the model over-predicted
prediction_df['Error'] = (y_predictions_test - y_test).round(1)
prediction_df

Unnamed: 0,father,mother,kids,gender_F,gender_M,True Height,Predicted Height,Error
331,0.513292,0.173627,-1.904297,1.0,0.0,60.0,64.9,4.9
638,-0.483608,-0.458911,0.706629,0.0,1.0,65.5,68.4,2.9
326,0.513292,-0.037219,0.706629,0.0,1.0,68.0,69.8,1.8
848,-1.679888,-0.037219,-0.039350,0.0,1.0,67.0,67.5,0.5
39,1.908952,-0.880603,0.706629,1.0,0.0,63.5,65.4,1.9
...,...,...,...,...,...,...,...,...
101,1.111432,-0.458911,-0.039350,1.0,0.0,70.0,64.9,-5.1
199,0.712672,-0.458911,1.079619,1.0,0.0,65.0,64.4,-0.6
851,-1.679888,-0.037219,-0.039350,1.0,0.0,64.0,62.3,-1.7
346,0.313912,-0.163727,0.706629,0.0,1.0,67.0,69.5,2.5


## Regression Metrics
Let us first get predictions and store them

In [None]:
# Get predictions for the training and test data; these are the inputs to
# every metric computed below
y_pred_train = lin_reg.predict(X_train_tf)
y_pred_test = lin_reg.predict(X_test_tf)

### MAE in Python
We can calculate the mean absolute error using NumPy:

In [None]:
# Calculating MAE with numpy
#write your code here



In [None]:
from sklearn.metrics import mean_absolute_error, precision_score

# precision_score is a classification metric: with a continuous target such
# as height it raises "ValueError: continuous is not supported".
precision_score(y_test, y_pred_test)

# Not reached when the call above raises. mean_absolute_error is a regression
# metric and is the appropriate choice for this problem.
mean_absolute_error(y_test, y_pred_test)

ValueError: continuous is not supported

In [None]:
# Mean absolute error via scikit-learn, for the training and the test split
from sklearn.metrics import mean_absolute_error
train_MAE = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
test_MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
print(f'Model Training MAE: {train_MAE:,.2f}')
print(f'Model Testing MAE: {test_MAE:,.2f}')

Model Training MAE: 1.70
Model Testing MAE: 1.68


## MSE in Python

In [None]:
# Mean squared error by hand: average of the squared residuals
mse_train = np.mean(np.square(y_train - y_pred_train))
mse_test = np.mean(np.square(y_test - y_pred_test))
print(mse_test)
mse_train

4.779644178781558


np.float64(4.561920498160115)

In [None]:
# Mean squared error via scikit-learn, for the training and the test split
from sklearn.metrics import mean_squared_error
train_MSE = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
test_MSE = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
print(f'Model Training MSE: {train_MSE:,.2f}')
print(f'Model Testing MSE: {test_MSE:,.2f}')

Model Training MSE: 4.56
Model Testing MSE: 4.78


## RMSE in Python

In [None]:
# Root mean squared error by hand: square root of the mean squared residual
train_RMSE = np.sqrt(np.mean(np.square(y_pred_train - y_train)))
test_RMSE = np.sqrt(np.mean(np.square(y_pred_test - y_test)))
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')

Model Training RMSE: 2.14
Model Testing RMSE: 2.19


In [None]:
# Root mean squared error: square root of scikit-learn's mean squared error
train_RMSE = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
test_RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')

Model Training RMSE: 2.14
Model Testing RMSE: 2.19


## R-squared in Python

In [None]:
# Calculating R-Squared with numpy
# R2 = 1 - SS_res / SS_tot, the coefficient of determination. The squared
# Pearson correlation is not used here: it equals R2 only for a least-squares
# fit scored on its own training data and can overstate R2 on held-out data.
ss_res_train = np.sum((y_train - y_pred_train)**2)
ss_tot_train = np.sum((y_train - np.mean(y_train))**2)
train_r2 = 1 - ss_res_train / ss_tot_train

ss_res_test = np.sum((y_test - y_pred_test)**2)
ss_tot_test = np.sum((y_test - np.mean(y_test))**2)
test_r2 = 1 - ss_res_test / ss_tot_test

print(f'Model Training R2: {train_r2:.2f}')
print(f'Model Testing R2: {test_r2:.2f}')

Model Training R2: 0.65
Model Testing R2: 0.61


In [None]:
# Coefficient of determination via scikit-learn, for both splits
from sklearn.metrics import r2_score
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
print(f'Model Training R2: {train_r2:.2f}')
print(f'Model Testing R2: {test_r2:.2f}')

Model Training R2: 0.65
Model Testing R2: 0.60


#### Define function to return a dictionary of the evaluation metrics

In [None]:
#write your code here

### Function to accomplish evaluation in one block

Define the Function

Reading custom functions is a great way to get better at writing them!

In [None]:
#write your code here

In [None]:
 # Test function with default arguments
#write your code here

In [None]:
# Test function by saving results as a dataframe
#write your code here