In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [None]:
housing = pd.read_csv('housing.csv')
print(housing.info())

In [None]:
#DATA VISUALIZATION
housing.hist(bins = 50,figsize=(10,10))

In [None]:
from sklearn.model_selection import train_test_split
test_size = 0.20
train_data, test_data = train_test_split(housing, test_size=test_size)
print(train_data.shape)
print(test_data.shape)

In [None]:
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

housing['income_cat'].hist()
plt.show()


In [None]:
split = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1 ,test_size=0.20)
for train_index, test_index in split.split(housing,housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [None]:
print(strat_train_set.shape)
print(strat_test_set.shape)

In [None]:
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))

In [None]:
#Removing the income cat column we added for strafied data
housing.drop(columns=["income cat"], inplace=True)

In [None]:
housing_new = housing.drop(columns=["ocean_proximity"])
corr_matrix = housing_new.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()


In [None]:
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_rooms']/housing['total_bedrooms']
housing['population_per_household'] = housing['population']/housing['households']

housing_num = housing.drop(columns = ['ocean_proximity'],axis = 1)


In [None]:
housing = strat_train_set.drop('median_income',axis = 1)
housing_labels = strat_train_set['median_income']
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)


In [28]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

In [31]:
housing_num = housing.drop('ocean_proximity',axis = 1)
imputer.fit(housing_num)
print(imputer.statistics_)

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

[-1.1848e+02  3.4250e+01  2.9000e+01  2.1320e+03  4.3600e+02  1.1670e+03
  4.1000e+02  1.7955e+05  3.0000e+00]


In [38]:
housing_cat =  housing[['ocean_proximity']]


from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_Cat_1h = cat_encoder.fit_transform(housing_cat)


  (np.int32(0), np.int32(0))	1.0
  (np.int32(1), np.int32(0))	1.0
  (np.int32(2), np.int32(0))	1.0
  (np.int32(3), np.int32(0))	1.0
  (np.int32(4), np.int32(0))	1.0
  (np.int32(5), np.int32(1))	1.0
  (np.int32(6), np.int32(0))	1.0
  (np.int32(7), np.int32(1))	1.0
  (np.int32(8), np.int32(1))	1.0
  (np.int32(9), np.int32(1))	1.0
  (np.int32(10), np.int32(0))	1.0
  (np.int32(11), np.int32(4))	1.0
  (np.int32(12), np.int32(4))	1.0
  (np.int32(13), np.int32(0))	1.0
  (np.int32(14), np.int32(1))	1.0
  (np.int32(15), np.int32(3))	1.0
  (np.int32(16), np.int32(1))	1.0
  (np.int32(17), np.int32(4))	1.0
  (np.int32(18), np.int32(0))	1.0
  (np.int32(19), np.int32(1))	1.0
  (np.int32(20), np.int32(0))	1.0
  (np.int32(21), np.int32(0))	1.0
  (np.int32(22), np.int32(0))	1.0
  (np.int32(23), np.int32(0))	1.0
  (np.int32(24), np.int32(0))	1.0
  :	:
  (np.int32(16487), np.int32(1))	1.0
  (np.int32(16488), np.int32(0))	1.0
  (np.int32(16489), np.int32(1))	1.0
  (np.int32(16490), np.int32(3))	1.0
  (np.

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Ensure you have this import
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Corrected
    ('std_scaler', StandardScaler())               # Corrected
])

housing_num_Tr = num_pipeline.fit_transform(housing_num)



In [42]:
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score,cross_val_predict

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

cross_val_predictions = cross_val_predict(lin_reg, housing_prepared, housing_labels)

In [48]:
housing_test = strat_train_set.drop('median_income',axis = 1)
housing_labels = strat_train_set['median_income']
housing_prepared = full_pipeline.fit_transform(housing_test)

housing_predicted = lin_reg.predict(housing_prepared)

(16512,) (16512, 10)


In [55]:
from sklearn.metrics import r2_score, mean_squared_error
lin_mse = mean_squared_error(housing_predicted, housing_labels)
lin_rmse = np.sqrt(lin_mse)

print(lin_rmse)
print(housing_predicted)
print(np.array(housing_labels))



0.7385016910707171
[5.01912503 5.12915028 5.21376514 ... 2.79842615 3.96796683 3.70763467]
[4.6389 5.2323 4.9375 ... 3.089  3.9844 3.6667]
