In [9]:
# Written by: Isidora Fletcher
# This Jupyter notebook combines the preprocessing steps written by Isi, Albina, Pragya and Ehsan.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [10]:
# Getting the raw data
df = pd.read_csv('housing.csv') # Notice: Raw data is in the Data folder

# Creating instance for label encoder.
le = LabelEncoder()

# Asssigning numerical values to ocean proximity.
# I changed the name from housing (in original code) to df to have consistency between Pragya's and Ehsan's codes.
df["ocean_proximity"]= le.fit_transform(df["ocean_proximity"])
print(df["ocean_proximity"])


# Observing missing values
missing_values_count = df.isnull().sum()
missing_values_count[:]

total_cells   = np.product(df.shape)
total_missing = missing_values_count.sum()
percent_missing = (total_missing/total_cells)*100
print('Percent of data that is missing:', percent_missing)

imputer = SimpleImputer(strategy = "median")
housing_numerical_attributes = df.drop("ocean_proximity", axis = 1) # We need to see how to put this back.
imputer.fit(housing_numerical_attributes)  
X = imputer.transform(housing_numerical_attributes)

# Data with replaced NA values.
# I changed the name from new_df (in original code) to housing.
housing = pd.DataFrame(X, columns = housing_numerical_attributes.columns, index = housing_numerical_attributes.index)

housing.insert(9,"ocean_proximity",df["ocean_proximity"],True)
print(housing)

# Splitting the data into training and testing sets.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=1)

# Creating pandas series full of zeros to store the standard deviation and the mean from the training set.
std_dev_tr= pd.Series({col:0 for col in train_set.columns}, dtype="float32")
mean_tr= pd.Series({col:0 for col in train_set.columns}, dtype="float32")

# Getting the values for the mean and standard deviation from the training dataset.
for col in train_set.columns:
    std_dev_tr[col]= train_set[col].std()
    mean_tr[col]= train_set[col].mean()
    # Changing the training data so it is normalized with the mean and standard deviation from the training set.
    train_set[col]=(train_set[col]-mean_tr[col])/std_dev_tr[col]

for col in test_set.columns:
    # Changing the testing data so it is normalized with the mean and standard deviation from the training set.
    test_set[col]=(test_set[col]-mean_tr[col])/std_dev_tr[col]    



0        3
1        3
2        3
3        3
4        3
5        3
6        3
7        3
8        3
9        3
10       3
11       3
12       3
13       3
14       3
15       3
16       3
17       3
18       3
19       3
20       3
21       3
22       3
23       3
24       3
25       3
26       3
27       3
28       3
29       3
        ..
20610    1
20611    1
20612    1
20613    1
20614    1
20615    1
20616    1
20617    1
20618    1
20619    1
20620    1
20621    1
20622    1
20623    1
20624    1
20625    1
20626    1
20627    1
20628    1
20629    1
20630    1
20631    1
20632    1
20633    1
20634    1
20635    1
20636    1
20637    1
20638    1
20639    1
Name: ocean_proximity, Length: 20640, dtype: int32
Percent of data that is missing: 0.1002906976744186
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
#### Code that also uses testing set for normalization.####
## Creating pandas series full of zeros to store the standard deviation and the mean from the training set.
#std_dev_tr= pd.Series({col:0 for col in train_set.columns}, dtype="float32")
#mean_tr= pd.Series({col:0 for col in train_set.columns}, dtype="float32")

## Creating pandas series full of zeros to store the standard deviation and the mean from the testing set.
#std_dev_te= pd.Series({col:0 for col in train_set.columns}, dtype="float32")
#mean_te= pd.Series({col:0 for col in train_set.columns}, dtype="float32")

## Getting the values for the mean and standard deviation from the training dataset.
#for col in train_set.columns:
#    std_dev_tr[col]= train_set[col].std()
#    mean_tr[col]= train_set[col].mean()
#    # Changing the training data so it is normalized with the mean and standard deviation from the training set.
#    train_set[col]=(train_set[col]-mean_tr[col])/std_dev_tr[col]

#for col in test_set.columns:
#    std_dev_te[col]= test_set[col].std()
#    mean_te[col]= test_set[col].mean()
#    # Changing the testing data so it is normalized with the mean and standard deviation from the testing set.
#    test_set[col]=(test_set[col]-mean_te[col])/std_dev_te[col]    


In [17]:
print(test_set.head())
print(train_set.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
4712    0.604852 -0.735672            0.825957     0.078298        0.315029   
2151   -0.102468  0.537089            0.667051    -0.208871       -0.198174   
15927  -1.417486  0.981619            1.382128    -0.377037       -0.303201   
82     -1.347750  1.019053            1.858847    -1.056592       -1.052717   
8161    0.719418 -0.847974            0.667051    -0.509823       -0.592027   

       population  households  median_income  median_house_value  \
4712    -0.281429    0.322682      -0.331018            1.273978   
2151    -0.251173   -0.163669      -1.003259           -1.185480   
15927    0.096767   -0.247342       0.072453            0.187422   
82      -1.090322   -1.081461      -1.245173           -0.823872   
8161    -0.635598   -0.568962       0.689026            0.152818   

       ocean_proximity  
4712         -0.819269  
2151         -0.116820  
15927         1.288079  
82            1.