In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import joblib

In [2]:
df = pd.read_csv("housing.csv")
df.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12061,-117.57,33.87,27.0,1786.0,287.0,939.0,278.0,5.1929,165000.0,INLAND
12566,-121.45,38.54,48.0,3421.0,734.0,1441.0,727.0,1.9485,86600.0,INLAND
15266,-117.27,33.04,27.0,1839.0,392.0,1302.0,404.0,3.55,214600.0,NEAR OCEAN
8294,-118.13,33.76,44.0,1543.0,463.0,652.0,406.0,4.25,439300.0,NEAR OCEAN
15930,-122.43,37.73,52.0,1985.0,401.0,1337.0,424.0,4.1071,240900.0,NEAR BAY


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
df["total_bedrooms"].value_counts()

total_bedrooms
280.0     55
331.0     51
345.0     50
343.0     49
393.0     49
          ..
2961.0     1
1915.0     1
1215.0     1
3298.0     1
1052.0     1
Name: count, Length: 1923, dtype: int64

In [6]:
df["total_bedrooms"].describe()

count    20433.000000
mean       537.870553
std        421.385070
min          1.000000
25%        296.000000
50%        435.000000
75%        647.000000
max       6445.000000
Name: total_bedrooms, dtype: float64

In [7]:
df["total_bedrooms"].median()

435.0

In [8]:
# used imputer for ocean_proximity columns
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp.fit(df[["ocean_proximity"]])

df[["ocean_proximity"]] = imp.transform(df[["ocean_proximity"]])

df.isnull().sum()


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
# impute null values into median for total_bedrooms
df["total_bedrooms"].fillna(df["total_bedrooms"].median(), inplace=True)

In [10]:
df["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [11]:
# Assuming you have a DataFrame with the columns containing non-numeric characters
# For example, let's say you have a DataFrame 'data' with a column 'longitude' containing values like '-122.23\t'

# Remove non-numeric characters from the 'longitude' column
df['longitude'] = df['longitude'].str.replace('\t', '')

# Convert the 'longitude' column to integers
df['longitude'] = df['longitude'].astype(int)


AttributeError: Can only use .str accessor with string values!

In [12]:
# Encode the categorical column into int
df["ocean_proximity"] = df["ocean_proximity"].map({"<1H OCEAN":0, "INLAND":1, "NEAR OCEAN":2, "NEAR BAY":3, "ISLAND":4})

In [13]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [14]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3


In [15]:
# split the data into X and y
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

In [16]:
# SPlit the the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 9), (4128, 9), (16512,), (4128,))

In [17]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)

In [18]:
model.score(X_test, y_test)

0.6133745270468163

In [19]:
joblib.dump(model, "model.pkl")


['model.pkl']

In [20]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [21]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3
