# Agenda
1. Polynominal Regression = Polyniminal Feature + Linear Regression
2. Pipoline
3. Column Transformer
4. One Hot Encoding

In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures


In [23]:
concrete_compressive_strength = fetch_ucirepo(id=165) 

In [24]:
# data (as pandas dataframes) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, test_size=0.3)


In [25]:
# Degree 1

In [26]:
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred= lr.predict(X_test)
r2_score(y_test,y_pred)

0.5771752777048791

In [27]:
# Degree 2

In [28]:
poly = PolynomialFeatures(degree=2,include_bias=False).set_output(transform='pandas')
X_poly_fit_train  = poly.fit_transform(X_train)    
X_poly_fit_test = poly.transform(X_test)
lr.fit(X_poly_fit_train,y_train)
y_pred = lr.predict(X_poly_fit_test)
r2_score(y_test,y_pred)

0.7807023265195984

In [29]:
# Degree 3

In [30]:
poly = PolynomialFeatures(degree=3,include_bias=False).set_output(transform='pandas')
X_poly_fit_train  = poly.fit_transform(X_train)    
X_poly_fit_test = poly.transform(X_test)
lr.fit(X_poly_fit_train,y_train)
y_pred = lr.predict(X_poly_fit_test)
r2_score(y_test,y_pred)

0.8622817297882757

In [31]:
# Degree 4

In [32]:
poly = PolynomialFeatures(degree=4,include_bias=False).set_output(transform='pandas')
X_poly_fit_train  = poly.fit_transform(X_train)    
X_poly_fit_test = poly.transform(X_test)
lr.fit(X_poly_fit_train,y_train)
y_pred = lr.predict(X_poly_fit_test)
r2_score(y_test,y_pred)

-128.64221126252144

# Using Pipelines 

In [38]:
from sklearn.pipeline import Pipeline
poly = PolynomialFeatures(degree=3,include_bias=False).set_output(transform='pandas')
lr = LinearRegression()
pipe = Pipeline([('POLY',poly),('LR',lr)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.8622817297882757

In [34]:
# PRARAMETERS : list of tuple = ('Name of staps',estimation )

# Housing Dataset

In [5]:
housing = pd.read_csv('Housing.csv')
housing.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [14]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    int64  
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 51.3+ KB


In [18]:
X = housing[['driveway']] 
y = housing['price']

# One Hot Encoding

In [26]:
# .getdummies()  from pandas

In [33]:
dum_X = pd.get_dummies(X)
print(dum_X['driveway_no'].value_counts())
print(dum_X['driveway_yes'].value_counts())

driveway_no
False    469
True      77
Name: count, dtype: int64
driveway_yes
True     469
False     77
Name: count, dtype: int64


In [29]:
housing.shape

(546, 12)

In [34]:
dum_X = pd.get_dummies(X,drop_first=True)   # (drop_first = True) to drop first column OR to reduce redudancy.
dum_X

Unnamed: 0,driveway_yes
0,True
1,True
2,True
3,True
4,True
...,...
541,True
542,True
543,True
544,True


In [35]:
lr = LinearRegression()
lr.fit(dum_X,y)
lr.intercept_,lr.coef_

(48555.77922077922, array([22778.11630161]))

In [42]:
# Price = 48555.77 + 22778.11 * driveway_yes      So, That is 71333.88
# if driveway = yes, price is :  71333.88
# if driveway = No, price is : 48555.77

In [43]:
housing.groupby('driveway')['price'].mean()

driveway
no     48555.779221
yes    71333.895522
Name: price, dtype: float64

In [48]:
# Now using 'driveway_no'

In [None]:
# Price = 71333.88 - 22778.11 * driveway_no      So, That is 48555.77
# if driveway = yes, price is :  71333.88
# if driveway = No, price is :  48555.77

In [52]:
dum_X = pd.get_dummies(X)
dum_X.drop('driveway_yes',axis=1,inplace =True)
dum_X.head()

Unnamed: 0,driveway_no
0,False
1,False
2,False
3,False
4,False


In [53]:
lr = LinearRegression()
lr.fit(dum_X,y)
lr.intercept_,lr.coef_

(71333.89552238806, array([-22778.11630161]))

In [54]:
housing.groupby('driveway')['price'].mean()

driveway
no     48555.779221
yes    71333.895522
Name: price, dtype: float64

# Another Method of OneHotEncodiing usng Sklearn
**OneHotEncoder()**

In [66]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')     
# (drop = 'first') : from dropping fiurst column 
#  [.set_output(transform='pandas')] : Output format is alwayas in numpy so to transform in pandas
# (sparse_output=False) : sparse by default is true ( So it store in compressed format i.e; SCR)

In [67]:
dum_X = ohe.fit_transform(X)
lr = LinearRegression()
lr.fit(dum_X,y)
lr.intercept_,lr.coef_

(48555.77922077922, array([22778.11630161]))

In [68]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    int64  
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 51.3+ KB


Two Columns **driveway** and **airco**

In [70]:
X = housing[['driveway','airco']]
y = housing['price']
dum_X = ohe.fit_transform(X)
lr = LinearRegression()
lr.fit(dum_X,y)
lr.intercept_,lr.coef_

(43790.67032031879, array([19302.6687155, 24460.8923557]))

In [71]:
dum_X.columns

Index(['driveway_yes', 'airco_yes'], dtype='object')

Now wth **numeric** column

In [74]:
housing['bedrooms'].unique()

array([3, 2, 4, 1, 5, 6], dtype=int64)

In [79]:
dum_X = pd.get_dummies(X,drop_first=True)
dum_X.columns

Index(['bedrooms', 'driveway_yes', 'airco_yes'], dtype='object')

In [81]:
# pd.get_dummies()  : does the hot encoding of only object type variables

In [101]:
# X = housing[['driveway','airco','lotsize']]
# y = housing['price']
# dum_X = ohe.fit_transform(X)
# lr = LinearRegression()
# lr.fit(dum_X,y)
# lr.intercept_,lr.coef_

#It peform OneHotEncoding on numeric column so that we dont want : for that we use [Column Ttransformer]

In [102]:
dum_X.columns

Index(['driveway_yes', 'airco_yes', 'lotsize_1700', 'lotsize_1836',
       'lotsize_1905', 'lotsize_1950', 'lotsize_2000', 'lotsize_2015',
       'lotsize_2135', 'lotsize_2145',
       ...
       'lotsize_11175', 'lotsize_11410', 'lotsize_11440', 'lotsize_11460',
       'lotsize_12090', 'lotsize_12900', 'lotsize_12944', 'lotsize_13200',
       'lotsize_15600', 'lotsize_16200'],
      dtype='object', length=285)

# **Column Transforms**

In [86]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [91]:
str_col = (housing.columns[housing.dtypes == 'object'])    # Give column which contain Object dtype

In [92]:
num_col = (housing.columns[housing.dtypes != 'object'])     # Give column which doesnot contain Object dtype

In [93]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')     

In [96]:
ct = make_column_transformer(('passthrough',num_col),(ohe,str_col),verbose_feature_names_out=False).set_output(transform='pandas')
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


In [121]:
# Now Perform on abow multiple columns which aslo contain numeric column

In [127]:
df = housing[['driveway','airco','lotsize']].copy()
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')      
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                             (ohe,make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


In [122]:
# Using mkae_column_selector()

In [124]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')      
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                             (ohe,make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


# One Hot Encoder Using Train Test Split 

In [131]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')      
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                             (ohe,make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
X= housing.drop('price',axis=1)
y = housing['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [132]:
X_ohe_trn = ct.fit_transform(X_train)
X_ohe_tst = ct.transform(X_test)
lr.fit(X_ohe_trn,y_train)
y_pred = lr.predict(X_ohe_tst)
r2_score(y_test,y_pred)


0.6246856191453717

# using Pipeline()

In [143]:
from sklearn.pipeline import Pipeline
lr = LinearRegression()
pipe = Pipeline([('trfn',ct),('LR',lr)])
pipe.fit(X_ohe_trn,y_train)

In [146]:
y_pred = pipe.predict(X_ohe_tst)
r2_score(y_test,y_pred)

0.6246856191453717

# OHE & Polynominal Transformatiion

In [155]:
poly = PolynomialFeatures(degree=1,include_bias=False).set_output(transform='pandas')
pipe = Pipeline([('TRNS',ct),('POLY',poly),('LR',lr)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.6246856191453718