# Importing packages

In [182]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, Ridge

# Importing data

In [149]:
df=pd.read_csv("data/classification/housing.csv")

# Selecting variables

In [150]:
df=df[['latitude',"longitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value","ocean_proximity"]]

# Data Preparation

In [151]:
df=df.fillna(0)

In [152]:
df["rooms_per_household"]=df["total_rooms"]/df["households"]

In [153]:
df["bedrooms_per_room"]=df["total_bedrooms"]/df["total_rooms"]

In [154]:
df["population_per_household"]=df["population"]/df["households"]

# Q1 : the most frequent observation for the column ocean_proximity

In [155]:
df['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

# Q2 : correlation matrix

In [156]:
df.select_dtypes(include=np.number).corr()[df.select_dtypes(include=np.number).corr()<1].max()

latitude                    0.106389
longitude                   0.099773
housing_median_age          0.125396
total_rooms                 0.920196
total_bedrooms              0.966507
population                  0.907222
households                  0.966507
median_income               0.688075
median_house_value          0.688075
rooms_per_household         0.326895
bedrooms_per_room           0.125396
population_per_household    0.069863
dtype: float64

the two features that have the biggest correlation in this dataset : "total_bedrooms" and "households"

# Q3 : Mutual information between two categorical variables

In [158]:
df.loc[df['median_house_value']>df['median_house_value'].mean(),"above_average"]=1
df.loc[df['median_house_value']<df['median_house_value'].mean(),"above_average"]=0

In [159]:
df1=df.drop("median_house_value",1)

In [160]:
X=df1.drop("above_average",1)
Y=df1["above_average"]

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [162]:
X.shape, X_train.shape, X_val.shape, X_test.shape

((20640, 12), (12384, 12), (4128, 12), (4128, 12))

In [163]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,1.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,1.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,1.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,1.0


In [164]:
round(mutual_info_score(y_train,X_train['ocean_proximity']),2)

0.1

# Q4 : Logistic Regression

In [165]:
X_train=pd.get_dummies(X_train)
X_val=pd.get_dummies(X_val)
X_test=pd.get_dummies(X_test)

In [166]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [167]:
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [168]:
acc_mod=model.score(X_val,y_val)
print(round(acc_mod,2))

0.84


# Q5 : feature elimination technique 

In [170]:
acc_var=pd.DataFrame({"Var":X_train.columns.to_list(),'acc':[np.nan]*len(X_train.columns.to_list()),'diff':[np.nan]*len(X_train.columns.to_list())})
for i in range(len(X_train.columns.to_list())):
    var_name=X_train.columns.to_list()[i]
    model.fit(X_train.drop(var_name,1),y_train)
    acc_var.loc[i,'acc']=model.score(X_val.drop(var_name,1),y_val)
    acc_var.loc[i,'diff']=acc_mod-acc_var.loc[i,'acc']
    

In [171]:
acc_var['diff']=np.abs(acc_var['diff'])

In [172]:
acc_var.sort_values('diff')

Unnamed: 0,Var,acc,diff
11,ocean_proximity_<1H OCEAN,0.835756,0.0
4,total_bedrooms,0.835514,0.000242
8,rooms_per_household,0.835271,0.000484
9,bedrooms_per_room,0.83624,0.000484
15,ocean_proximity_NEAR OCEAN,0.835271,0.000484
14,ocean_proximity_NEAR BAY,0.836483,0.000727
0,latitude,0.834787,0.000969
13,ocean_proximity_ISLAND,0.836725,0.000969
3,total_rooms,0.837209,0.001453
10,population_per_household,0.837694,0.001938


the variable is total_bedrooms

# Q6 : Linear Regression

In [175]:
df1=df.drop("above_average",1)

In [176]:
df1.columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [177]:
X=df1.drop("median_house_value",1)
Y=df1["median_house_value"]

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [179]:
X.shape, X_train.shape, X_val.shape, X_test.shape

((20640, 12), (12384, 12), (4128, 12), (4128, 12))

In [180]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,1.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,1.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,1.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,1.0


In [181]:
X_train=pd.get_dummies(X_train)
X_val=pd.get_dummies(X_val)
X_test=pd.get_dummies(X_test)

In [185]:
alpha_list=[0, 0.01, 0.1, 1, 10]
rmse_alpha=pd.DataFrame({"alpha":alpha_list,'RMSE':[np.nan]*len(alpha_list)})
for i in range(len(alpha_list)):
    model = Ridge(alpha=alpha_list[i], solver="sag", random_state=42)
    model.fit(X_train,y_train)
    rmse_alpha.loc[i,'RMSE']=mean_squared_error(y_val,model.predict(X_val))

In [186]:
rmse_alpha

Unnamed: 0,alpha,RMSE
0,0.0,11307720000.0
1,0.01,11307720000.0
2,0.1,11307720000.0
3,1.0,11307720000.0
4,10.0,11307720000.0
