In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('housings.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [5]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
 df.total_bedrooms.fillna(0)

0         129.0
1        1106.0
2         190.0
3         235.0
4         280.0
          ...  
20635     374.0
20636     150.0
20637     485.0
20638     409.0
20639     616.0
Name: total_bedrooms, Length: 20640, dtype: float64

In [7]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# DATA PREPARATION

In [8]:
df1 = df.dropna()
df1.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [9]:
df2 = df1.copy()
df2['rooms_per_household'] = df2['total_rooms'] / df2['households']
df2['rooms_per_household']

0        6.984127
1        6.238137
2        8.288136
3        5.817352
4        6.281853
           ...   
20635    5.045455
20636    6.114035
20637    5.205543
20638    5.329513
20639    5.254717
Name: rooms_per_household, Length: 20433, dtype: float64

In [10]:
# df3 = df2.copy()
df2['bedrooms_per_room'] = df2['total_bedrooms'] / df2['total_rooms']
df2['bedrooms_per_room']

0        0.146591
1        0.155797
2        0.129516
3        0.184458
4        0.172096
           ...   
20635    0.224625
20636    0.215208
20637    0.215173
20638    0.219892
20639    0.221185
Name: bedrooms_per_room, Length: 20433, dtype: float64

In [11]:
df2.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room'],
      dtype='object')

In [12]:
df2['population_per_household'] = df2['population'] / df2['households']
df2['population_per_household']

0        2.555556
1        2.109842
2        2.802260
3        2.547945
4        2.181467
           ...   
20635    2.560606
20636    3.122807
20637    2.325635
20638    2.123209
20639    2.616981
Name: population_per_household, Length: 20433, dtype: float64

In [13]:
df2.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

QUESTION 1

In [14]:
df2.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

# Splitting the data

In [16]:
cat = ['ocean_proximity']

In [17]:
df2.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924616,-0.109357,0.04548,0.069608,0.10027,0.056513,-0.01555,-0.045398,-0.027307,0.092657,0.002304
latitude,-0.924616,1.0,0.011899,-0.036667,-0.066983,-0.108997,-0.071774,-0.079626,-0.144638,0.106423,-0.113815,0.002522
housing_median_age,-0.109357,0.011899,1.0,-0.360628,-0.320451,-0.295787,-0.302768,-0.118278,0.106432,-0.153031,0.136089,0.013258
total_rooms,0.04548,-0.036667,-0.360628,1.0,0.93038,0.857281,0.918992,0.197882,0.133294,0.133482,-0.1879,-0.024596
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686,0.001538,0.084238,-0.028355
population,0.10027,-0.108997,-0.295787,0.857281,0.877747,1.0,0.907186,0.005087,-0.0253,-0.071898,0.035319,0.070062
households,0.056513,-0.071774,-0.302768,0.918992,0.979728,0.907186,1.0,0.013434,0.064894,-0.080165,0.065087,-0.027336
median_income,-0.01555,-0.079626,-0.118278,0.197882,-0.007723,0.005087,0.013434,1.0,0.688355,0.325307,-0.615661,0.018894
median_house_value,-0.045398,-0.144638,0.106432,0.133294,0.049686,-0.0253,0.064894,0.688355,1.0,0.151344,-0.25588,-0.023639
rooms_per_household,-0.027307,0.106423,-0.153031,0.133482,0.001538,-0.071898,-0.080165,0.325307,0.151344,1.0,-0.416952,-0.004873


In [18]:
df2.corr().unstack().sort_values(ascending = False)

longitude            longitude              1.000000
latitude             latitude               1.000000
bedrooms_per_room    bedrooms_per_room      1.000000
rooms_per_household  rooms_per_household    1.000000
median_house_value   median_house_value     1.000000
                                              ...   
bedrooms_per_room    rooms_per_household   -0.416952
                     median_income         -0.615661
median_income        bedrooms_per_room     -0.615661
longitude            latitude              -0.924616
latitude             longitude             -0.924616
Length: 144, dtype: float64

# Make median_house_value binary

In [19]:
data_class = df2.copy()
mean = data_class['median_house_value'].mean()

data_class['above_average'] = np.where(data_class['median_house_value']>=mean,1,0)

In [41]:
data_class = data_class.drop('median_house_value')

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
df_train_full, df_test = train_test_split(data_class, test_size=0.2, random_state=42)

In [23]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [24]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [25]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [36]:
df_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-117.96,33.86,35.0,2146.0,430.0,1230.0,429.0,3.7813,<1H OCEAN,5.002331,0.200373,2.867133
1,-119.50,35.27,23.0,3827.0,696.0,1993.0,617.0,3.0742,INLAND,6.202593,0.181866,3.230146
2,-117.04,32.71,28.0,5274.0,991.0,3727.0,961.0,3.5700,NEAR OCEAN,5.488033,0.187903,3.878252
3,-116.41,33.74,17.0,4289.0,893.0,958.0,440.0,2.4659,INLAND,9.747727,0.208207,2.177273
4,-121.99,37.25,25.0,1743.0,212.0,604.0,200.0,10.7582,<1H OCEAN,8.715000,0.121629,3.020000
...,...,...,...,...,...,...,...,...,...,...,...,...
12254,-118.40,34.11,32.0,5578.0,753.0,1567.0,697.0,15.0001,<1H OCEAN,8.002869,0.134995,2.248207
12255,-117.97,34.04,28.0,1686.0,417.0,1355.0,388.0,2.5192,<1H OCEAN,4.345361,0.247331,3.492268
12256,-117.09,32.76,44.0,1139.0,214.0,470.0,217.0,3.5481,NEAR OCEAN,5.248848,0.187884,2.165899
12257,-118.16,34.19,42.0,2076.0,462.0,1641.0,436.0,2.2326,<1H OCEAN,4.761468,0.222543,3.763761


# Mutual information

In [26]:
from sklearn.metrics import mutual_info_score

In [27]:
def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)

df_mi = df_train[cat].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

In [28]:
df_mi

Unnamed: 0,MI
ocean_proximity,0.100126


In [29]:
df_train = df_train.drop('above_average', axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

# logistic regression

In [42]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [43]:
numerical = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income', 
'rooms_per_household',
'bedrooms_per_room',
'population_per_household']

In [44]:
train_dict = df_train[cat + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

x_train = dv.transform(train_dict)

In [48]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(x_train, y_train)

val_dict = df_val[cat + numerical].to_dict(orient='records')
x_val = dv.transform(val_dict)

y_pred = model.predict(x_val)

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8


# feature elimination technique

In [49]:
features = cat + numerical
features

['ocean_proximity',
 'latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [54]:
import warnings
warnings.filterwarnings("ignore")


orig_score = accuracy


for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    x_train = dv.transform(train_dict)

    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(x_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    x_val = dv.transform(val_dict)

    y_pred = model.predict(x_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)

ocean_proximity -0.018204061658918524 0.8182040616589186
latitude -0.02040616589185218 0.8204061658918522
longitude -0.014533887937362322 0.8145338879373624
housing_median_age -0.022363591876682132 0.8223635918766822
total_rooms -0.024321017861512084 0.8243210178615121
total_bedrooms -0.024565696109615787 0.8245656961096158
population 0.0011255199412772976 0.7988744800587227
households -0.027012478590653255 0.8270124785906533
median_income 0.01702960606802062 0.7829703939319794
rooms_per_household -0.027991191583068176 0.8279911915830682
bedrooms_per_room 0.002593589429899734 0.7974064105701003
population_per_household -0.026278443846342037 0.8262784438463421


# ridge regression

In [63]:
df2['median_house_value']=np.log1p(df2['median_house_value'])

In [64]:
df_train_full, df_test = train_test_split(df2, test_size=0.2, random_state=42)

In [65]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [66]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [67]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [68]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [69]:
train_dict = df_train[cat + numerical].to_dict(orient='records')

In [70]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

x_train = dv.transform(train_dict)

val_dict = df_val[cat + numerical].to_dict(orient='records')
x_val = dv.transform(val_dict)

In [71]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [72]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a,random_state=42)
    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 3))

0 0.321
0.01 0.321
0.1 0.321
1 0.321
10 0.321
