# Feature Selection Techniques

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Location of the startups dataset (Colab-style absolute path, kept as in the original run).
DATA_PATH = "/content/50_Startups.csv"

# Load the raw data into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [6]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
# Column names, non-null counts and dtypes of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [8]:
# One-hot encode State as integer 0/1 indicator columns (one column per state).
state_dummies = pd.get_dummies(df["State"], dtype=int)

# Positions 0, 1, 2 and 4 of df are R&D Spend, Administration, Marketing Spend
# and Profit; position 3 (State) is replaced by the indicator columns above.
numeric_columns = df.iloc[:, [0, 1, 2, 4]]

# NOTE(review): every State level keeps its own indicator column. Once an
# intercept column is added for the regression, the indicators sum to that
# intercept and the regressors become linearly dependent (dummy-variable trap).
# Using drop_first=True here would avoid that, but the positional column
# selections further down would have to be re-indexed at the same time.
finaldf = pd.concat([state_dummies, numeric_columns], axis=1)

In [9]:
# Confirm the encoded frame: state indicator columns followed by the numeric columns.
finaldf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   California       50 non-null     int64  
 1   Florida          50 non-null     int64  
 2   New York         50 non-null     int64  
 3   R&D Spend        50 non-null     float64
 4   Administration   50 non-null     float64
 5   Marketing Spend  50 non-null     float64
 6   Profit           50 non-null     float64
dtypes: float64(4), int64(3)
memory usage: 2.9 KB


In [10]:
# Preview the first rows of the encoded frame.
finaldf.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


# 1. Correlation Analysis

In [11]:
# Pairwise correlation matrix over all columns of finaldf (features and Profit).
finaldf.corr()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
California,1.0,-0.492366,-0.515152,-0.143165,-0.015478,-0.168875,-0.145837
Florida,-0.492366,1.0,-0.492366,0.105711,0.010493,0.205685,0.116244
New York,-0.515152,-0.492366,1.0,0.039068,0.005145,-0.03367,0.031368
R&D Spend,-0.143165,0.105711,0.039068,1.0,0.241955,0.724248,0.9729
Administration,-0.015478,0.010493,0.005145,0.241955,1.0,-0.032154,0.200717
Marketing Spend,-0.168875,0.205685,-0.03367,0.724248,-0.032154,1.0,0.747766
Profit,-0.145837,0.116244,0.031368,0.9729,0.200717,0.747766,1.0


# 2. Backward Elimination using OLS

In [12]:
# Column positions in finaldf: 0-2 are the state indicators, 3-5 the spend
# columns, and 6 is Profit (the regression target).
feature_positions = [0, 1, 2, 3, 4, 5]
label_positions = [6]

# Plain NumPy arrays for statsmodels; labels stays 2-D (one column).
features = finaldf.iloc[:, feature_positions].to_numpy()
labels = finaldf.iloc[:, label_positions].to_numpy()

In [16]:
# Inspect the feature matrix.
# NOTE(review): this displays the whole array; a slice such as features[:5] would keep the output short.
features

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [17]:
# Prepend a column of ones so the regression includes an intercept term.
# The integer ones are upcast when stacked next to the float feature columns.
intercept_column = np.ones((features.shape[0], 1), dtype=int)
featureAllIn = np.hstack([intercept_column, features])

In [18]:
# Inspect the design matrix: column 0 is the intercept, columns 1-6 are the features.
# NOTE(review): this displays the whole array; a slice such as featureAllIn[:5] would keep the output short.
featureAllIn

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.6534920e+05, 1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.6259770e+05, 1.5137759e+05, 4.4389853e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.5344151e+05, 1.0114555e+05, 4.0793454e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.4437241e+05, 1.1867185e+05, 3.8319962e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.4210734e+05, 9.1391770e+04, 3.6616842e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.3187690e+05, 9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.3461546e+05, 1.4719887e+05, 1.2771682e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.3029813e+05, 1.4553006e+05, 3.2387668e+05],
       [1.0000000e+00, 0

In [21]:
# SL (significance level) = 0.05
# SL is the probability of rejecting the null hypothesis when it is actually true (Type I error).
# CL (confidence level) = 95%
# CL is the probability that the confidence interval (CI) contains the true population parameter.

# CL is the complement of the significance level:
# CL = 1 - SL

SL = 0.05

In [23]:
import statsmodels.regression.linear_model as stat

# Step 2-3: fit the "all in" model (intercept plus every feature) with ordinary least squares.
# In the summary, const is column 0 of featureAllIn and x1..x6 are columns 1..6
# (California, Florida, New York, R&D Spend, Administration, Marketing Spend).
#
# NOTE(review): the intercept column equals the sum of the three state indicator
# columns, so this design matrix is rank-deficient. The summary shows this as
# Df Model = 5 for six regressors and a condition number around 1e17. The
# coefficients and p-values reported for const and x1-x3 are therefore not
# individually identified and should not drive elimination decisions.
all_in_ols = stat.OLS(labels, featureAllIn)
model = all_in_ols.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 08 Mar 2025",Prob (F-statistic):,1.34e-27
Time:,14:53:31,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.763e+04,5073.636,7.417,0.000,2.74e+04,4.79e+04
x1,1.249e+04,2449.797,5.099,0.000,7554.868,1.74e+04
x2,1.269e+04,2726.700,4.654,0.000,7195.596,1.82e+04
x3,1.245e+04,2486.364,5.007,0.000,7439.285,1.75e+04
x4,0.8060,0.046,17.369,0.000,0.712,0.900
x5,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x6,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1.81e+17


In [24]:
# Step 4: Select the feature with the highest p-value.
# Here that is x5 (Administration), with p = 0.608.

In [25]:
# Step 5: Compare the highest p-value against SL.
# if pvalue > SL:
#         eliminate the Administration column
#         rebuild the feature set without it
# Here 0.608 > 0.05, so column 5 of featureAllIn (Administration) is left out below.


newFeatureCol = featureAllIn[:,[0,1,2,3,4,6]]

In [26]:
# Refit OLS on the reduced feature set (Administration removed).
# In this summary x5 refers to Marketing Spend, since the columns after the
# dropped one shift down by one position.
reduced_ols = stat.OLS(labels, newFeatureCol)
model = reduced_ols.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Sat, 08 Mar 2025",Prob (F-statistic):,9.720000000000001e-29
Time:,14:54:23,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.525e+04,2100.376,16.782,0.000,3.1e+04,3.95e+04
x1,1.171e+04,1910.312,6.130,0.000,7861.854,1.56e+04
x2,1.185e+04,2170.903,5.459,0.000,7477.785,1.62e+04
x3,1.169e+04,1988.428,5.879,0.000,7684.996,1.57e+04
x4,0.7967,0.042,18.771,0.000,0.711,0.882
x5,0.0298,0.016,1.842,0.072,-0.003,0.062

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,1.46e+17


In [27]:
# Marketing Spend (column 6 of featureAllIn) had p = 0.072 > SL in the previous
# fit, so it is eliminated as well. What remains is the intercept, the three
# state indicators and R&D Spend.
kept_columns = [0, 1, 2, 3, 4]
newFeatureCol = featureAllIn[:, kept_columns]

# Refit OLS on the remaining columns and show the summary.
model = stat.OLS(labels, newFeatureCol).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sat, 08 Mar 2025",Prob (F-statistic):,2.76e-29
Time:,14:54:36,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,1959.786,18.806,0.000,3.29e+04,4.08e+04
x1,1.189e+04,1956.677,6.079,0.000,7955.697,1.58e+04
x2,1.306e+04,2122.665,6.152,0.000,8785.448,1.73e+04
x3,1.19e+04,2036.022,5.847,0.000,7805.580,1.6e+04
x4,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,3.2e+17


In [29]:
# p-values of each coefficient in the current `model`, in column order (intercept first).
model.pvalues

array([3.14565551e-23, 2.20580553e-07, 1.71425332e-07, 4.91366030e-07,
       1.07848040e-30])

In [28]:
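# Final feature set for modelling: California, Florida, New York, R&D Spend
# Final label: Profit
# NOTE(review): the three state indicators were fitted together with an intercept, so their
# p-values come from a rank-deficient design; verify their significance after dropping one indicator.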