In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import joblib

In [26]:
df = pd.read_csv("housing.csv")
df.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
10590,-117.78,33.69,16.0,4702.0,806.0,2529.0,814.0,5.1299,238900.0,<1H OCEAN
2538,-124.17,40.8,52.0,661.0,316.0,392.0,244.0,0.957,60000.0,NEAR OCEAN
18568,-121.75,36.92,48.0,1801.0,353.0,1071.0,361.0,3.6,194500.0,<1H OCEAN
3575,-118.52,34.23,35.0,1471.0,210.0,735.0,219.0,8.3841,472200.0,<1H OCEAN
11530,-118.08,33.76,25.0,1995.0,637.0,743.0,597.0,1.4617,46900.0,<1H OCEAN


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [17]:
df.isnull().sum()

longitude                0
latitude                 0
housing_median_age       0
total_rooms              0
total_bedrooms           0
population               0
households               0
median_income            0
median_house_value       0
ocean_proximity       4948
dtype: int64

In [18]:
df["total_bedrooms"].value_counts()

total_bedrooms
435.0     244
280.0      55
331.0      51
345.0      50
343.0      49
         ... 
2961.0      1
1915.0      1
1215.0      1
3298.0      1
1052.0      1
Name: count, Length: 1923, dtype: int64

In [19]:
df["total_bedrooms"].describe()

count    20640.000000
mean       536.838857
std        419.391878
min          1.000000
25%        297.000000
50%        435.000000
75%        643.250000
max       6445.000000
Name: total_bedrooms, dtype: float64

In [20]:
df["total_bedrooms"].median()

435.0

In [29]:
# used imputer for ocean_proximity columns
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp.fit(df[["ocean_proximity"]])

df[["ocean_proximity"]] = imp.transform(df[["ocean_proximity"]])

df.isnull().sum()


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [35]:
from sklearn.preprocessing import OneHotEncoder

# Assuming X_train contains your feature data including 'ocean_proximity'
encoder = OneHotEncoder(sparse=False)
X_train_encoded = encoder.fit_transform(X_train[['ocean_proximity']])




In [37]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [41]:
df["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [42]:
df_encoded = pd.get_dummies(df, columns=['ocean_proximity'])

In [43]:
# split the data into X and y
X = df_encoded.drop("median_house_value", axis=1)
y = df_encoded["median_house_value"]

In [44]:
# SPlit the the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 13), (4128, 13), (16512,), (4128,))

In [45]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)

In [46]:
model.score(X_test, y_test)

0.6254240620553597

In [47]:
joblib.dump(model, "linear_regression_model.pkl")


['linear_regression_model.pkl']

In [48]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')