# K-means model for a house classification system using the California housing dataset

## 1. Import modules 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

from utils import cross_val

## 2. Data preparation 
### 2.1 Data Download 

In [2]:
# Read the housing CSV directly from the tutorial repository and show the loaded frame.
housing_csv_url = 'https://raw.githubusercontent.com/4GeeksAcademy/k-means-project-tutorial/main/housing.csv'
data_df = pd.read_csv(housing_csv_url)
print(data_df)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  MedHouseVal  
0      

### 2.2 Data inspection 

In [3]:
# Column names, non-null counts, dtypes and memory usage of the loaded frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [6]:
# First five rows, transposed so that each column of the frame is shown as a row.
data_df.head().T

Unnamed: 0,0,1,2,3,4
MedInc,8.3252,8.3014,7.2574,5.6431,3.8462
HouseAge,41.0,21.0,52.0,52.0,52.0
AveRooms,6.984127,6.238137,8.288136,5.817352,6.281853
AveBedrms,1.02381,0.97188,1.073446,1.073059,1.081081
Population,322.0,2401.0,496.0,558.0,565.0
AveOccup,2.555556,2.109842,2.80226,2.547945,2.181467
Latitude,37.88,37.86,37.85,37.85,37.85
Longitude,-122.23,-122.22,-122.24,-122.25,-122.25
MedHouseVal,4.526,3.585,3.521,3.413,3.422


### 2.3 Filter only required features 

In [7]:
# Narrow the frame to the three required columns, then confirm the resulting schema.
selected_columns = ['MedInc', 'Latitude', 'Longitude']
data_df = data_df[selected_columns]
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MedInc     20640 non-null  float64
 1   Latitude   20640 non-null  float64
 2   Longitude  20640 non-null  float64
dtypes: float64(3)
memory usage: 483.9 KB


In [8]:
# Summary statistics again, now for the frame restricted to MedInc, Latitude and Longitude.
data_df.describe()

Unnamed: 0,MedInc,Latitude,Longitude
count,20640.0,20640.0,20640.0
mean,3.870671,35.631861,-119.569704
std,1.899822,2.135952,2.003532
min,0.4999,32.54,-124.35
25%,2.5634,33.93,-121.8
50%,3.5348,34.26,-118.49
75%,4.74325,37.71,-118.01
max,15.0001,41.95,-114.31


In [9]:
# First five rows of the filtered frame, transposed (one row per remaining column).
data_df.head().T

Unnamed: 0,0,1,2,3,4
MedInc,8.3252,8.3014,7.2574,5.6431,3.8462
Latitude,37.88,37.86,37.85,37.85,37.85
Longitude,-122.23,-122.22,-122.24,-122.25,-122.25


### 2.4 Train-test split

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
training_data, testing_data = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
)

# Report the schema and size of each partition, training first and testing second.
for split_df in (training_data, testing_data):
    split_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 14196 to 15795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MedInc     16512 non-null  float64
 1   Latitude   16512 non-null  float64
 2   Longitude  16512 non-null  float64
dtypes: float64(3)
memory usage: 516.0 KB
<class 'pandas.core.frame.DataFrame'>
Index: 4128 entries, 20046 to 3665
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MedInc     4128 non-null   float64
 1   Latitude   4128 non-null   float64
 2   Longitude  4128 non-null   float64
dtypes: float64(3)
memory usage: 129.0 KB
