# Lab 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data
housing = pd.read_csv('https://raw.githubusercontent.com/thomouvic/SENG474/main/data/housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Check for any invalid values
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

# Attribute Combinations

In [4]:
# Experimenting with attribute combinations
housing["rooms_per_house"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_house"]=housing["population"]/housing["households"]

In [5]:
# Make a correlation matrix and see how our new features correlate to what we want to predict
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value      1.000000
median_income           0.688075
rooms_per_house         0.151948
total_rooms             0.134153
housing_median_age      0.105623
households              0.065843
total_bedrooms          0.049686
population_per_house   -0.023737
population             -0.024650
longitude              -0.045967
latitude               -0.144160
bedrooms_ratio         -0.255880
Name: median_house_value, dtype: float64

# Prepare data for ML

In [6]:
# Reload the original data
housing = pd.read_csv('https://raw.githubusercontent.com/thomouvic/SENG474/main/data/housing.csv')

# Create the stratified sampling of the data based on income categories
from sklearn.model_selection import train_test_split

housing["income_cat"] = pd.cut(housing["median_income"], 
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf], 
                               labels=[1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, random_state=42,
                                                   stratify=housing["income_cat"])

# Drop income_cat column. We won't be needing it anymore.
strat_train_set.drop("income_cat", axis=1, inplace=True)
strat_test_set.drop("income_cat", axis=1, inplace=True)

# Split the data frames into features and labels
# Note that features are still Data Frame objects but the labels are Series objects
housing_train = strat_train_set.drop("median_house_value", axis=1)
housing_labels_train = strat_train_set["median_house_value"].copy()

housing_test = strat_test_set.drop("median_house_value", axis=1)
housing_labels_test = strat_test_set["median_house_value"].copy()

print(housing_train.head())
print()
print(housing_labels_train.head())
print()
print(housing_test.head())
print()
print(housing_labels_test.head())


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42     37.80                52.0       3321.0          1115.0   
14973    -118.38     34.14                40.0       1965.0           354.0   
3785     -121.98     38.36                33.0       1083.0           217.0   
14689    -117.11     33.75                17.0       4174.0           851.0   
20507    -118.15     33.77                36.0       4366.0          1211.0   

       population  households  median_income ocean_proximity  
13096      1576.0      1034.0         2.0987        NEAR BAY  
14973       666.0       357.0         6.0876       <1H OCEAN  
3785        562.0       203.0         2.4330          INLAND  
14689      1845.0       780.0         2.2618          INLAND  
20507      1912.0      1172.0         3.5292      NEAR OCEAN  

13096    458300.0
14973    483800.0
3785     101700.0
14689     96100.0
20507    361800.0
Name: median_house_value, dtype: float64

       longitu

### Clean the Data
Most machine learning algorithms cannot work with missing features, so you’ll need to take
care of these. Imputation is a common way of filling in missing values.

In [7]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [8]:
# Make a numerical only version of the training data and fit the imputer to that
housing_num = housing_train.select_dtypes(include=[np.number])
imputer.fit(housing_num)

In [9]:
# As you can see we have fit the imputer to the median of the training data
print(housing_num.median())
print(imputer.statistics_)

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2125.0000
total_bedrooms         434.0000
population            1167.0000
households             408.0000
median_income            3.5385
dtype: float64
[-118.51     34.26     29.     2125.      434.     1167.      408.
    3.5385]


In [10]:
# Finally lets take our numerical training data and impute the missing values
X = imputer.transform(housing_num)

# This transforms our data frame into an numpy array of feature vectors with imputation
print(X.shape)

# For comparision
print(housing_num.head(1))
print(X[0])

(16512, 8)
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42      37.8                52.0       3321.0          1115.0   

       population  households  median_income  
13096      1576.0      1034.0         2.0987  
[-1.2242e+02  3.7800e+01  5.2000e+01  3.3210e+03  1.1150e+03  1.5760e+03
  1.0340e+03  2.0987e+00]


In [11]:
# Assuming our data was nice we could now convert our labels and begin working with models
y = np.asarray(housing_labels_train)

print(X[:5])
print(y[:5])

[[-1.2242e+02  3.7800e+01  5.2000e+01  3.3210e+03  1.1150e+03  1.5760e+03
   1.0340e+03  2.0987e+00]
 [-1.1838e+02  3.4140e+01  4.0000e+01  1.9650e+03  3.5400e+02  6.6600e+02
   3.5700e+02  6.0876e+00]
 [-1.2198e+02  3.8360e+01  3.3000e+01  1.0830e+03  2.1700e+02  5.6200e+02
   2.0300e+02  2.4330e+00]
 [-1.1711e+02  3.3750e+01  1.7000e+01  4.1740e+03  8.5100e+02  1.8450e+03
   7.8000e+02  2.2618e+00]
 [-1.1815e+02  3.3770e+01  3.6000e+01  4.3660e+03  1.2110e+03  1.9120e+03
   1.1720e+03  3.5292e+00]]
[458300. 483800. 101700.  96100. 361800.]


# Transformation Pipelines
We still have more preproccessing of the data we would like to perform, so in order to make things more managable, we should make a transformation pipeline that can apply multiple transformations in sequence.

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("standardize", StandardScaler())])

# If you don't want to name transformers you can do it this way as well
#num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
num_pipeline


In [13]:
# Apply the transformations via the pipeline and view the results
# num_pipeline.fit(housing_num)
# housing_num_tr = num_pipeline.transform(housing_num)

# fit_transform() is a convenience function that calls the above two functions in order
housing_num_tr = num_pipeline.fit_transform(housing_num)

print(housing_num_tr[:5])

[[-1.42303652  1.0136059   1.86111875  0.31191221  1.36816703  0.13746004
   1.39481249 -0.93649149]
 [ 0.59639445 -0.702103    0.90762971 -0.30861991 -0.43592476 -0.69377062
  -0.37348471  1.17194198]
 [-1.2030985   1.27611874  0.35142777 -0.71224036 -0.76070869 -0.78876841
  -0.77572662 -0.75978881]
 [ 1.23121557 -0.88492444 -0.91989094  0.70226169  0.74230601  0.38317548
   0.73137454 -0.85028088]
 [ 0.71136206 -0.87554898  0.58980003  0.79012465  1.59575285  0.44437597
   1.75526303 -0.18036472]]


In [14]:
# Convert numpy array back to a data frame and print the head
df_housing_num_tr = pd.DataFrame(housing_num_tr, columns=housing_num.columns)
print(df_housing_num_tr.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.423037  1.013606            1.861119     0.311912        1.368167   
1   0.596394 -0.702103            0.907630    -0.308620       -0.435925   
2  -1.203098  1.276119            0.351428    -0.712240       -0.760709   
3   1.231216 -0.884924           -0.919891     0.702262        0.742306   
4   0.711362 -0.875549            0.589800     0.790125        1.595753   

   population  households  median_income  
0    0.137460    1.394812      -0.936491  
1   -0.693771   -0.373485       1.171942  
2   -0.788768   -0.775727      -0.759789  
3    0.383175    0.731375      -0.850281  
4    0.444376    1.755263      -0.180365  


In [15]:
# Compare the two data frames
print(housing_num.head())
print()
print(df_housing_num_tr.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42     37.80                52.0       3321.0          1115.0   
14973    -118.38     34.14                40.0       1965.0           354.0   
3785     -121.98     38.36                33.0       1083.0           217.0   
14689    -117.11     33.75                17.0       4174.0           851.0   
20507    -118.15     33.77                36.0       4366.0          1211.0   

       population  households  median_income  
13096      1576.0      1034.0         2.0987  
14973       666.0       357.0         6.0876  
3785        562.0       203.0         2.4330  
14689      1845.0       780.0         2.2618  
20507      1912.0      1172.0         3.5292  

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.423037  1.013606            1.861119     0.311912        1.368167   
1   0.596394 -0.702103            0.907630    -0.308620       -0.435925   
2  -1.203098  1.2

# STUDENT SECTION

In [16]:
# Load the data set and print the first 5 entries
# https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset
from sklearn.datasets import load_diabetes
data = load_diabetes(as_frame=True, scaled = False)
diabetes = data['frame']
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,59.0,2.0,32.1,101.0,157.0,93.2,38.0,4.0,4.8598,87.0,151.0
1,48.0,1.0,21.6,87.0,183.0,103.2,70.0,3.0,3.8918,69.0,75.0
2,72.0,2.0,30.5,93.0,156.0,93.6,41.0,4.0,4.6728,85.0,141.0
3,24.0,1.0,25.3,84.0,198.0,131.4,40.0,5.0,4.8903,89.0,206.0
4,50.0,1.0,23.0,101.0,192.0,125.4,52.0,4.0,4.2905,80.0,135.0


In [17]:
# Check for any invalid values

# Attribute Combinations

In [18]:
# Experiment with different attribute combinations (Please make at least 3)
# You are welcome to experiment with these however you wish
# Just don't include target as an attribute as it is what we are trying to predict
# diabetes["Combination_1"] = 
# diabetes["Combination_2"] = 
# diabetes["Combination_3"] = 

In [19]:
# Make a correlation matrix and see how our new features correlate to what we want to predict ("target")

# Prepare data for ML

In [20]:
# Reload the original data
data = load_diabetes(as_frame=True, scaled = False)
diabetes = data['frame']

# Create stratified train and test sets as in Lab 1
diabetes["bmi_cat"] = pd.cut(diabetes["bmi"],
                               bins=[0., 18.5, 25., 30., np.inf],
                               labels=[1, 2, 3, 4])

strat_train_set, strat_test_set = train_test_split(diabetes, test_size=0.2, random_state=42,
                                                   stratify=diabetes["bmi_cat"])

# Drop the bmi_cat column once the training and test sets have been made


# Split the data frames into features and labels


# Print the head of each data frame
# Note that features are still Data Frame objects but the labels are Series objects


# Transformation Pipelines

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Create a pipeline with a StandardScaler and print the pipeline

In [22]:
# Transform the training set using the new pipeline and print the first 5 values of the results

In [23]:
# Convert the numpy array back to a data frame and print the head