# <center>Introduction to Data Science and Computing - Fall 2021</center>

### Importing Modules

In [120]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Read in all our data

In [121]:
# Load the raw housing table from the relative Data folder.
df = pd.read_csv('Data/housing.csv')

# Report the dimensions of the loaded frame.
rows = df.shape[0]
columns = df.shape[1]
print("Number of rows: ", rows)
print("Number of columns: ", columns)

Number of rows:  20640
Number of columns:  10


### Set seed for reproducibility

In [122]:
# Seed NumPy's global random generator for reproducibility.
SEED = 0
np.random.seed(SEED)

### Check if there are any missing values (NaN or None)


In [123]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### How many missing data points do we have?
### Get the number of missing data points per column

In [124]:
# Per-column count of missing entries (NaN/None); the bare name on the last
# line makes the Series the cell's displayed output.
missing_values_count = df.isna().sum()
missing_values_count

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### It's helpful to see what percentage of the values in our dataset are missing, as this gives us a better sense of the scale of the problem.

In [125]:
# Share of all cells in the frame that are missing, expressed as a percentage.
# DataFrame.size (rows * columns) replaces np.product(df.shape): np.product was
# deprecated in NumPy 1.25 and removed in NumPy 2.0, so the old call raises
# AttributeError on current NumPy.
total_cells = df.size
total_missing = missing_values_count.sum()
percent_missing = (total_missing / total_cells) * 100
print('Percent of data that is missing:', percent_missing)

Percent of data that is missing: 0.1002906976744186


### Take a closer look at some of the columns with missing values
## >>>>  total_bedrooms 
### These values are probably missing because they were not recorded, rather than because they don't exist. So it makes sense to estimate what they should be rather than leaving them as NaN.

### Data Imputation 

In [126]:
from sklearn.impute import SimpleImputer

# Median imputation only applies to numeric columns, so set the text column
# "ocean_proximity" aside before fitting.
housing_numerical_attributes = df.drop(columns="ocean_proximity")

# Learn each column's median and fill the missing entries in a single step;
# the imputer is left fitted, exactly as after a separate fit + transform.
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(housing_numerical_attributes)

### New DataFrame with the missing values replaced by column medians


In [130]:
# Re-wrap the imputed array as a DataFrame, reusing the column labels and row
# index of the numeric frame it was computed from.
new_df = pd.DataFrame(
    data=X,
    index=housing_numerical_attributes.index,
    columns=housing_numerical_attributes.columns,
)

# Preview the first five rows of the imputed frame.
new_df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
