In [21]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [15]:
# Download dataset id 165 from the UCI Machine Learning Repository.
concrete_compressive_strength = fetch_ucirepo(id=165)

# Unpack the feature table and the target table (both pandas DataFrames).
X, y = (
    concrete_compressive_strength.data.features,
    concrete_compressive_strength.data.targets,
)

In [16]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [23]:
# Baseline: ordinary least squares on the untransformed features,
# scored with R^2 on the held-out split.
lr = LinearRegression()
y_pred = lr.fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.5771752777048791

In [25]:
# Degree-2 polynomial expansion; no bias column is generated.
poly = PolynomialFeatures(
    degree=2,
    include_bias=False,
).set_output(transform='pandas')

# Learn the expansion on the training split only, then apply it to the test split.
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)

# Refit the existing LinearRegression on the expanded features and score on held-out data.
y_pred = lr.fit(X_poly_trn, y_train).predict(X_poly_tst)
r2_score(y_test, y_pred)

0.7807023265195984

In [27]:
# Degree-3 polynomial expansion; no bias column is generated.
poly = PolynomialFeatures(
    degree=3,
    include_bias=False,
).set_output(transform='pandas')

# Learn the expansion on the training split only, then apply it to the test split.
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)

# Refit the existing LinearRegression on the expanded features and score on held-out data.
y_pred = lr.fit(X_poly_trn, y_train).predict(X_poly_tst)
r2_score(y_test, y_pred)

0.8622817297882757

In [29]:
# Degree-4 polynomial expansion; no bias column is generated.
# NOTE(review): the R^2 recorded for this cell is strongly negative (about -128.6),
# i.e. far worse than a constant prediction on the test split -- presumably the many
# degree-4 terms over-fit the training data; confirm before using this degree.
poly = PolynomialFeatures(
    degree=4,
    include_bias=False,
).set_output(transform='pandas')

# Learn the expansion on the training split only, then apply it to the test split.
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)

# Refit the existing LinearRegression on the expanded features and score on held-out data.
y_pred = lr.fit(X_poly_trn, y_train).predict(X_poly_tst)
r2_score(y_test, y_pred)

-128.64221126252144

## Using Pipeline

In [31]:
from sklearn.pipeline import Pipeline

# Pipeline steps: degree-3 polynomial expansion followed by ordinary least squares.
poly = PolynomialFeatures(degree=3, include_bias=False).set_output(transform='pandas')
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# Fitting the pipeline fits each step in order on the training split;
# predict sends the test split through the same fitted steps.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.8622817297882757

# Housing Dataset

In [39]:
# Load the housing data from "Housing.csv" (path relative to the working directory)
# and preview the first rows.
housing = pd.read_csv("Housing.csv")
housing.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [41]:
# Summarise each column's dtype and non-null count to see which columns are categorical.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    int64  
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 51.3+ KB


In [43]:
# Target: the sale price as a Series.
y = housing["price"]
# Predictor: the categorical 'driveway' column, kept as a one-column DataFrame
# (double brackets) so it stays two-dimensional.
X = housing[['driveway']]

## OneHotEncoding

In [None]:
# pd.get_dummies() from pandas

In [56]:
# One indicator column per category of 'driveway'.
dum_X = pd.get_dummies(X)

# The two indicators are complements of each other, so their counts mirror one another.
for dummy_col in ('driveway_no', 'driveway_yes'):
    print(dum_X[dummy_col].value_counts())

driveway_no
False    469
True      77
Name: count, dtype: int64
driveway_yes
True     469
False     77
Name: count, dtype: int64


In [62]:
# Keep a single indicator for 'driveway': with two categories, one column fully
# determines the other, so the second one is redundant.
#
# The dummies are rebuilt from X here instead of dropping from the existing dum_X,
# which keeps this cell idempotent -- re-running it no longer raises KeyError on
# the already-removed 'driveway_yes' column.
#
# Related shortcut: pd.get_dummies(X, drop_first=True) also removes the redundancy,
# but it drops the *first* category's column ('driveway_no') rather than 'driveway_yes'.
dum_X = pd.get_dummies(X).drop("driveway_yes", axis='columns')
dum_X

Unnamed: 0,driveway_no
0,False
1,False
2,False
3,False
4,False
...,...
541,False
542,False
543,False
544,False


In [64]:
# Regress price on the single 'driveway_no' indicator.
# Intercept: fitted price when the indicator is 0; coefficient: shift when it is 1.
lr = LinearRegression().fit(dum_X, y)
lr.intercept_, lr.coef_

(71333.89552238806, array([-22778.11630161]))

In [66]:
# Cross-check the regression against plain group means: the 'yes' mean equals the
# fitted intercept and the 'no' mean equals intercept + coefficient.
housing.groupby('driveway')['price'].mean()

driveway
no     48555.779221
yes    71333.895522
Name: price, dtype: float64

#### Another method of one-hot encoding, using scikit-learn

In [75]:
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder is a data transformation class. Parameters of interest:
# drop = 'first'  -> drops the first category of each encoded column
# sparse_output   -> whether the result is a sparse matrix (True) or a dense array (False)
# dtype           -> desired dtype of the output
# handle_unknown  -> specifies how unknown categories are handled during transform
# unknown category: a category that was not seen during fitting (training) but is encountered at transform time (testing)



In [87]:
# sparse_output=False asks for a dense result instead of a sparse matrix; on its own
# that would be a NumPy array, but set_output(transform='pandas') makes the encoder
# return a DataFrame with named columns.
# drop='first' removes the first category's column of every encoded feature.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.set_output(transform='pandas')
dum_X = ohe.fit_transform(X)

In [89]:
# Regress price on the single indicator produced by the OneHotEncoder above.
# Intercept: fitted price when the indicator is 0; coefficient: shift when it is 1.
lr = LinearRegression().fit(dum_X, y)
lr.intercept_, lr.coef_

(48555.77922077922, array([22778.11630161]))

#### For multiple variables

In [112]:
# OneHotEncoder one-hot encodes every column it is given, including numeric ones.

# We do not want to one-hot encode numeric columns.

In [135]:
# Demonstration of the pitfall: 'lotsize' is numeric, but OneHotEncoder treats every
# distinct value as its own category, which is why the fitted coefficient vector
# printed by this cell is so long.
y = housing['price']
X = housing[['driveway', 'airco', 'lotsize']]

ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform='pandas')
dum_X = ohe.fit_transform(X)

lr = LinearRegression().fit(dum_X, y)
lr.intercept_, lr.coef_

(44999.999999999854,
 array([ 1.09342964e+04,  1.61790177e+04, -2.89342964e+04, -1.25000000e+04,
         1.70000000e+04, -9.96714818e+03, -1.79342964e+04, -9.34296356e+02,
         5.00000000e+03, -4.41762969e+03, -1.00000000e+03, -1.79017701e+02,
        -9.34296356e+02, -2.01133141e+04,  1.50000000e+04, -1.13792964e+04,
        -2.28446482e+04, -7.00000000e+03, -2.19342964e+04, -1.41790177e+04,
        -7.27901770e+03,  9.06570364e+03,  4.03285182e+03, -4.50000000e+03,
        -1.59342964e+04, -2.61133141e+04, -5.00000000e+01,  1.50000000e+04,
        -8.68429636e+03, -4.15665703e+03,  3.35000000e+04, -1.29342964e+04,
         7.00000000e+03, -1.93429636e+03,  1.00657036e+04, -1.01714818e+03,
        -1.39000000e+04, -1.21133141e+04,  1.40657036e+04, -1.85000000e+04,
        -1.09423808e+04, -5.93429636e+03, -4.00000000e+03, -6.43429636e+03,
        -1.09342964e+04, -6.96714818e+03,  3.00000000e+03,  9.88567881e+02,
         1.86309740e+03, -8.21331406e+03, -4.69292850e+03, -1.63606

#### Column Transformers

In [129]:
# Column transformers are used when the numeric columns need to be handled separately from the non-numeric ones.
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [117]:
# Inspect dtypes: the object columns are the categorical ones to be one-hot encoded.
housing.dtypes

price       float64
lotsize       int64
bedrooms      int64
bathrms       int64
stories       int64
driveway     object
recroom      object
fullbase     object
gashw        object
airco        object
garagepl      int64
prefarea     object
dtype: object

In [123]:
# Partition the column names by dtype: object columns go to the encoder, all others
# are treated as numeric. Because this runs on the full frame, num_cols also
# contains the target column 'price'.
num_cols = [name for name, dtype in housing.dtypes.items() if dtype != object]
obj_cols = [name for name, dtype in housing.dtypes.items() if dtype == object]

# Dense one-hot encoder that drops the first category of each column and returns a DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")



In [133]:
# Numeric columns pass through untouched; object columns are one-hot encoded.
# verbose_feature_names_out=False keeps output names free of a transformer-name prefix.
ct = make_column_transformer(
    ('passthrough', num_cols),
    (ohe, obj_cols),
    verbose_feature_names_out=False,
)
ct.set_output(transform="pandas")
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


In [146]:
# Work on a three-column subset; .copy() makes it independent of `housing`.
df = housing.loc[:, ['driveway', 'airco', 'lotsize']].copy()

In [150]:
# Partition the subset's column names by dtype: object columns go to the encoder,
# all others are treated as numeric.
num_cols = [name for name, dtype in df.dtypes.items() if dtype != object]
obj_cols = [name for name, dtype in df.dtypes.items() if dtype == object]

# Dense one-hot encoder that drops the first category of each column and returns a DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")



In [152]:
# Numeric columns pass through untouched; object columns are one-hot encoded.
# verbose_feature_names_out=False keeps output names free of a transformer-name prefix.
ct = make_column_transformer(
    ('passthrough', num_cols),
    (ohe, obj_cols),
    verbose_feature_names_out=False,
)
ct.set_output(transform="pandas")
ct.fit_transform(df)

Unnamed: 0,lotsize,driveway_yes,airco_yes
0,5850,1.0,0.0
1,4000,1.0,0.0
2,3060,1.0,0.0
3,6650,1.0,0.0
4,6360,1.0,0.0
...,...,...,...
541,4800,1.0,1.0
542,6000,1.0,1.0
543,6000,1.0,1.0
544,6000,1.0,1.0


#### make_column_selector function

In [None]:
# Rebuild the three-column working frame (an independent copy of those housing
# columns) for the make_column_selector example.

df = housing[['driveway', 'airco', 'lotsize']].copy()

In [None]:
# Dense one-hot encoder (first category of each column dropped) that returns a DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")

In [156]:
# using make_column_selector
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude = object)), 
                             (ohe, make_column_selector(dtype_include = object)),
                             verbose_feature_names_out = False
                            ).set_output(transform = "pandas")
ct.fit_transform(df)

Unnamed: 0,lotsize,driveway_yes,airco_yes
0,5850,1.0,0.0
1,4000,1.0,0.0
2,3060,1.0,0.0
3,6650,1.0,0.0
4,6360,1.0,0.0
...,...,...,...
541,4800,1.0,1.0
542,6000,1.0,1.0
543,6000,1.0,1.0
544,6000,1.0,1.0


#### For whole dataset

In [179]:
# Preprocessing for the full feature set: non-object columns unchanged,
# object columns one-hot encoded (first category dropped).
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")
ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Separate the target from the predictors so 'price' never reaches the transformer,
# then hold out 30% of the rows with a fixed seed.
y = housing['price']
X = housing.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [181]:
# Use a fresh estimator instead of relying on whichever `lr` object an earlier,
# unrelated section happened to leave in the kernel namespace.
lr = LinearRegression()

# Fit the column transformer on the training split only, then reuse the fitted
# transformer for the test split.
X_ohe_trn = ct.fit_transform(X_train)
X_ohe_tst = ct.transform(X_test)

# Train on the encoded training features and report R^2 on the held-out split.
lr.fit(X_ohe_trn, y_train)
y_pred = lr.predict(X_ohe_tst)
r2_score(y_test, y_pred)

0.6246856191453717

#### Using Pipeline

In [190]:
from sklearn.pipeline import Pipeline

# Step "TRNS": pass non-object columns through and one-hot encode object columns.
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")
ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Step "POLY": degree-2 expansion of everything the column transformer emits.
# NOTE(review): this also squares the 0/1 dummy columns, and x**2 == x for those,
# so the expansion contains exact duplicates of the dummies -- consider restricting
# the polynomial step to the numeric columns.
poly = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')

# Step "LR": ordinary least squares on the expanded features.
lr = LinearRegression()

# Chain the steps, fit on the training split and score R^2 on the held-out split.
pipe = Pipeline(steps=[("TRNS", ct), ("POLY", poly), ('LR', lr)])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.5558314095911471