In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Load the housing dataset from the working directory and display it.
data = pd.read_csv('housing.csv')
data

In [None]:
# Print column dtypes and non-null counts to spot missing values.
data.info()

In [None]:
# Discard every row that has at least one missing value, then re-check the
# non-null counts to confirm nothing is left.
data = data.dropna()
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the median house value; every other column is a feature.
y = data['median_house_value']
X = data.drop(columns=['median_house_value'])

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle repeatable, so the split (and every number derived from it) is the
# same after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Put the target back next to the training features so both can be explored
# (histograms, correlations) in a single frame.
train_data = X_train.assign(median_house_value=y_train)
train_data

In [None]:
# Histogram of every numeric column in the training frame.
train_data.hist()

In [None]:
# Pairwise correlations of the numeric columns; ocean_proximity is dropped
# first because it is not numeric.
train_data.drop(columns=['ocean_proximity']).corr()

In [None]:
# Same correlation matrix as above, drawn as an annotated heatmap.
sns.heatmap(
    data=train_data.drop(columns=['ocean_proximity']).corr(),
    annot=True,
)

In [None]:
# Replace each count column with log(x + 1); the +1 keeps a zero count finite.
for count_column in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    train_data[count_column] = np.log(train_data[count_column] + 1)

In [None]:
# Re-plot the histograms to see the effect of the log transform.
train_data.hist()

In [None]:
# One-hot encode ocean_proximity: remove the text column and append one
# indicator column per category found in the training split.
train_data = (
    train_data
    .drop(columns=['ocean_proximity'])
    .join(pd.get_dummies(train_data['ocean_proximity']))
)

In [None]:
# Correlation heatmap of all columns, including the new indicator columns,
# on a wider canvas so the annotations stay legible.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(train_data.corr(), annot=True, ax=ax)

In [None]:
# Derived ratio features, computed from the (already log-transformed) columns.
train_data = train_data.assign(
    bedroom_ratio=lambda frame: frame['total_bedrooms'] / frame['total_rooms'],
    household_rooms=lambda frame: frame['total_rooms'] / frame['households'],
)

# Split the engineered frame back into model inputs and target.
y_train = train_data['median_house_value']
X_train = train_data.drop(columns=['median_house_value'])

In [None]:
# Apply to the held-out split the same feature engineering that was applied to
# train_data.
# NOTE: this cell reads X_test/y_test and then overwrites them, so running it
# twice without re-running the split would apply the transforms twice.
test_data = X_test.join(y_test)

# Same log(x + 1) transform as the training frame.
for count_column in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    test_data[count_column] = np.log(test_data[count_column] + 1)

# One-hot encode ocean_proximity.
test_data = test_data.join(pd.get_dummies(test_data.ocean_proximity)).drop(['ocean_proximity'], axis=1)

test_data['bedroom_ratio'] = test_data['total_bedrooms'] / test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms'] / test_data['households']

# get_dummies only creates columns for categories present in *this* split, so a
# rare category can be missing here (or appear only here). Align to the
# training layout so the scaler and the model receive exactly the same columns
# in the same order; an indicator missing from this split is filled with 0 and
# a category never seen in training is dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

X_test = test_data.drop(['median_house_value'], axis=1)
y_test = test_data['median_house_value']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Standardise the features; the scaler is fitted on the training split only
# and reused unchanged for any data passed to the model later.
scalar = StandardScaler()
X_train_s = scalar.fit_transform(X_train)

# Seed the forest so its bootstrap samples and feature sub-sampling, and
# therefore the reported score, are repeatable across runs.
forest = RandomForestRegressor(random_state=42)

forest.fit(X_train_s, y_train)

In [None]:
# Inspect the engineered test frame.
test_data

In [None]:
# Standardise the test features with the scaler fitted on the training split
# (transform only, no refit).
X_test_s = scalar.transform(X_test)

In [None]:
# Evaluate on the held-out split; for scikit-learn regressors score() is R^2.
forest.score(X_test_s, y_test)

In [None]:
# The forest was fitted on standardised features (X_train_s), so a hand-written
# sample has to pass through the same fitted scaler before prediction; feeding
# raw values would put it on a different scale from the training data.
# The values are assumed to be listed in X_train column order - TODO confirm.
sample_features = pd.DataFrame(
    [[-122.46, 37.79, 52.0, 2.054431, 1.718240, 1.905135, 1.742137, 14.2959,
      0, 0, 0, 1, 0, 0.672515, 1.444389]],
    columns=X_train.columns,
)
forest.predict(scalar.transform(sample_features))

In [None]:
# Show the held-out target values.
y_test

In [None]:
import pickle

# Persist the fitted forest to disk. Only the model is written; the fitted
# scaler is not part of model.pkl.
with open('model.pkl', mode='wb') as f:
    pickle.dump(forest, f)

In [None]:
# Rebuild the test features from test_data and cast the five ocean_proximity
# indicator columns to integers in a single astype call.
X_test = (
    test_data
    .drop(columns=['median_house_value'])
    .astype(dict.fromkeys(
        ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], int
    ))
)

X_test

In [None]:
import json
import csv

# Export the test features as a single JSON array of row arrays
# (orient='values' drops the column names and the index).
# CSV alternative, kept for reference:
# X_test.to_csv(f'preprocessed(something).csv', index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
X_test.to_json(path_or_buf='preprocessed(something).jsonl', orient='values')



In [None]:
import json

# The export is one JSON array of rows; re-emit its first 20 rows as JSON
# Lines (one JSON document per line).
# The parsed rows get their own name: `data` already holds the housing
# DataFrame, and rebinding it to a list would break any re-run of the
# earlier cells.
with open('preprocessed(something).jsonl', 'r', encoding='utf-8') as f:
    rows = json.load(f)

with open('preprocessed_normal.jsonl', 'w', encoding='utf-8') as f:
    for row in rows[:20]:
        json.dump(row, f)
        f.write('\n')

In [None]:
# Display the final test feature frame.
X_test