In [1]:
import pandas as pd

# Load the raw home-price table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="home_prices.csv")

# Preview the first five rows to sanity-check the columns that were read.
df.head(n=5)

Unnamed: 0,area_sqr_ft,bedrooms,color,price_lakhs
0,3774,2,Red,216
1,1460,3,Gray,88
2,1894,4,Gray,147
3,1730,2,Blue,84
4,1695,1,Blue,77


In [3]:
# List the distinct colour labels. Colour is a nominal (unordered) category,
# which is why it is one-hot encoded rather than mapped to integers.
df.loc[:, "color"].unique()

array(['Red', 'Gray', 'Blue', 'Yellow', 'Green', 'White'], dtype=object)

# One-Hot Encoding

In [6]:
# One-hot encode the nominal `color` column. `drop_first=True` drops the first
# category's indicator (no `color_Blue` column appears in the result), so that
# category acts as the baseline and the dummies are not perfectly collinear.
#
# The cell rebinds `df`, so it is guarded: once `color` has been replaced by
# its dummy columns, re-running the cell is a no-op instead of raising a
# KeyError for the now-missing column.
if "color" in df.columns:
    df = pd.get_dummies(df, columns=["color"], drop_first=True)
df.head()

Unnamed: 0,area_sqr_ft,bedrooms,price_lakhs,color_Gray,color_Green,color_Red,color_White,color_Yellow
0,3774,2,216,False,False,True,False,False
1,1460,3,88,True,False,False,False,False
2,1894,4,147,True,False,False,False,False
3,1730,2,84,False,False,False,False,False
4,1695,1,77,False,False,False,False,False


In [8]:
# Pairwise Pearson correlation (the pandas default) between every column of
# the encoded frame, including the target `price_lakhs`.
cm = df.corr(method="pearson")
cm

Unnamed: 0,area_sqr_ft,bedrooms,price_lakhs,color_Gray,color_Green,color_Red,color_White,color_Yellow
area_sqr_ft,1.0,0.18581,0.945365,-0.068944,-0.032012,0.059055,0.063827,-0.037819
bedrooms,0.18581,1.0,0.439445,0.040882,-0.120207,-0.004177,-0.023676,0.015286
price_lakhs,0.945365,0.439445,1.0,-0.040565,-0.041959,0.045803,0.051122,-0.046673
color_Gray,-0.068944,0.040882,-0.040565,1.0,-0.214409,-0.23099,-0.205931,-0.217205
color_Green,-0.032012,-0.120207,-0.041959,-0.214409,1.0,-0.190117,-0.169493,-0.178771
color_Red,0.059055,-0.004177,0.045803,-0.23099,-0.190117,1.0,-0.1826,-0.192596
color_White,0.063827,-0.023676,0.051122,-0.205931,-0.169493,-0.1826,1.0,-0.171703
color_Yellow,-0.037819,0.015286,-0.046673,-0.217205,-0.178771,-0.192596,-0.171703,1.0


In [10]:
# Correlation of every column with the target, taken as one column of the matrix.
cm.loc[:, "price_lakhs"]

area_sqr_ft     0.945365
bedrooms        0.439445
price_lakhs     1.000000
color_Gray     -0.040565
color_Green    -0.041959
color_Red       0.045803
color_White     0.051122
color_Yellow   -0.046673
Name: price_lakhs, dtype: float64

In [12]:
# Only the strength of the association matters for the filter below, not its
# sign, so take the magnitude of each correlation with the target.
cm_price = cm["price_lakhs"].abs()
cm_price

area_sqr_ft     0.945365
bedrooms        0.439445
price_lakhs     1.000000
color_Gray      0.040565
color_Green     0.041959
color_Red       0.045803
color_White     0.051122
color_Yellow    0.046673
Name: price_lakhs, dtype: float64

In [16]:
# Column names whose absolute correlation with the target exceeds 0.2.
# The target itself is still included here (its self-correlation is 1.0).
cm_price.loc[cm_price.gt(0.2)].index

Index(['area_sqr_ft', 'bedrooms', 'price_lakhs'], dtype='object')

In [18]:
# Keep the columns whose |correlation| with the target is above 0.2, then
# remove the target itself so only predictor names remain.
selected_features = (
    cm_price.loc[cm_price.gt(0.2)]
    .index
    .drop("price_lakhs")
)
selected_features

Index(['area_sqr_ft', 'bedrooms'], dtype='object')

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Target is the price column; predictors are the correlation-selected features.
y = df["price_lakhs"]
X = df[selected_features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows only.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("r2   ", r2)
print("MSE   ", mse)

r2    0.9689466488379601
MSE    76.6333219827881


For non-linear relationships, correlation should not be used for feature selection, because the Pearson coefficient only measures linear association.

In the presence of outliers, correlation should not be used for feature selection, because a few extreme values can distort the coefficient.

For categorical variables, correlation should not be used for feature selection, because the coefficient assumes numeric values with a meaningful order.

Understand the relationship between causation and correlation: a strong correlation does not, by itself, imply causation.