In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer

In [None]:
# Keep the input path in one named place so it is easy to change.
HOUSING_CSV_PATH = '/content/housing.csv'

# Load the raw housing table into a DataFrame.
df = pd.read_csv(HOUSING_CSV_PATH)

In [None]:
# Preview the first five rows to sanity-check the loaded table.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# List the dtype pandas assigned to each column of df.
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [None]:
# Pairwise correlation matrix of the numeric columns.
# Non-numeric columns (here `ocean_proximity`, dtype object) are excluded
# explicitly: pandas >= 2.0 no longer drops them silently, so a bare
# df.corr() raises on the string values.
df.select_dtypes(include='number').corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.930435,-0.081186,0.035471,0.065559,0.099139,0.054572,0.037959,0.078021
latitude,-0.930435,1.0,-0.031497,-0.013397,-0.052162,-0.101685,-0.062405,-0.10518,-0.226672
housing_median_age,-0.081186,-0.031497,1.0,-0.38005,-0.346868,-0.312717,-0.326337,-0.132122,0.067785
total_rooms,0.035471,-0.013397,-0.38005,1.0,0.929554,0.859815,0.915375,0.212847,0.140808
total_bedrooms,0.065559,-0.052162,-0.346868,0.929554,1.0,0.884828,0.977903,0.004213,0.053898
population,0.099139,-0.101685,-0.312717,0.859815,0.884828,1.0,0.916217,0.008246,-0.021711
households,0.054572,-0.062405,-0.326337,0.915375,0.977903,0.916217,1.0,0.024339,0.06984
median_income,0.037959,-0.10518,-0.132122,0.212847,0.004213,0.008246,0.024339,1.0,0.696207
median_house_value,0.078021,-0.226672,0.067785,0.140808,0.053898,-0.021711,0.06984,0.696207,1.0


In [None]:
# Distinct category labels present in the ocean_proximity column.
proximity_col = df['ocean_proximity']
proximity_col.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [None]:
# Integer-encode the ocean_proximity categories.
# NOTE: the encoded array is only displayed by this cell; it is not assigned
# back to df.
lbl = LabelEncoder()
lbl.fit(df['ocean_proximity'])
lbl.transform(df['ocean_proximity'])

array([3, 3, 3, ..., 1, 1, 1])

In [None]:
# Count the missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [None]:
# Replace missing total_bedrooms entries with the column mean.
smi = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer expects 2-D input, hence the double brackets.
bedrooms_filled = smi.fit_transform(df[['total_bedrooms']])
df['total_bedrooms'] = bedrooms_filled

In [None]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [None]:
# Remove the categorical ocean_proximity column from df itself
# (inplace=True mutates the existing frame instead of returning a copy).
df.drop(columns=['ocean_proximity'], inplace=True)

In [None]:
# Preview the first five rows of the frame in its current state.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [None]:
# Split the frame by position: every column except the last one forms the
# feature matrix, and the last column is the regression target.
n_cols = df.shape[1]
X = df.iloc[:, :n_cols - 1]
y = df.iloc[:, n_cols - 1]

In [None]:
# Standardise every feature column (zero mean, unit variance).
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

In [None]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable.
# The split is done on the *unscaled* features and the scaler is then fitted
# on the training rows only, so the test rows do not influence the scaling
# statistics. X_std (scaled using the full dataset) is therefore not used here.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Report the dimensions of the training features and targets.
for label, arr in (('X_train shape:', X_train), ('y_train shape:', y_train)):
    print(label, arr.shape)

X_train shape: (14448, 8)
y_train shape: (14448,)


In [None]:
# Two hidden layers (100 and 50 units), ReLU activations, L-BFGS solver.
# random_state pins the weight initialisation so the score is repeatable
# across runs. max_iter is raised above the library default because the fit
# stopped at the iteration limit (see the convergence warning in the fit
# cell's output); increase it further if that warning persists.
regressor = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
regressor

MLPRegressor(hidden_layer_sizes=(100, 50), solver='lbfgs')

In [None]:
# Train the network on the training split.
regressor.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(hidden_layer_sizes=(100, 50), solver='lbfgs')

In [None]:
# Evaluate on the held-out split; score() returns R^2 for regressors,
# printed here as a percentage.
r2_test = regressor.score(X_test, y_test)
print(r2_test * 100)

78.2286148451364
