In [None]:
# Predictive Modeling

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [None]:
""" many algorithms will execute faster with smaller samples.
Accurate models can often be built with as few as several thousand records. Hence, we
will want to sample a subset of records for model building."""

In [2]:
#Read a csv file to a dataFrame WestRox
housing_df = pd.read_csv('https://raw.githubusercontent.com/reisanar/datasets/master/WestRoxbury.csv')

In [3]:
housing_df.to_csv('E:/WestRoxbury.csv')   # writing to E drive

In [3]:
housing_df.head(2)

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent


In [4]:
type(housing_df.REMODEL.values)

numpy.ndarray

In [5]:
housing_df.REMODEL.dtype

dtype('O')

In [7]:
housing_df.sample(2)

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
1956,381.5,4799,5275,1930,2616,1568,2.0,6,3,1,1,1,1,
2189,392.5,4937,5429,1930,2704,1544,2.0,7,3,1,0,1,0,


In [8]:
housing_df.columns # print a list of variables
# REMODEL needs to be converted to a categorical variable

Index(['TOTAL VALUE ', 'TAX', 'LOT SQFT ', 'YR BUILT', 'GROSS AREA ',
       'LIVING AREA', 'FLOORS ', 'ROOMS', 'BEDROOMS ', 'FULL BATH',
       'HALF BATH', 'KITCHEN', 'FIREPLACE', 'REMODEL'],
      dtype='object')

In [9]:
housing_df.columns = [s.strip().replace(" " , "_") for s in housing_df.columns]

In [10]:
housing_df.REMODEL = housing_df.REMODEL.astype("category")

In [11]:
housing_df.REMODEL.cat.categories # Show number of categories

Index(['None', 'Old', 'Recent'], dtype='object')

In [12]:
housing_df.REMODEL.dtype # Check type of converted variable

CategoricalDtype(categories=['None', 'Old', 'Recent'], ordered=False)

In [14]:
#Creating Dummy Variables in pandas

In [13]:
housing_df.columns

Index(['TOTAL_VALUE', 'TAX', 'LOT_SQFT', 'YR_BUILT', 'GROSS_AREA',
       'LIVING_AREA', 'FLOORS', 'ROOMS', 'BEDROOMS', 'FULL_BATH', 'HALF_BATH',
       'KITCHEN', 'FIREPLACE', 'REMODEL'],
      dtype='object')

In [14]:
# use drop_first=True to drop the first dummy variable (There are 3 ie 'None', 'Old', 'Recent')
housing_df = pd.get_dummies(housing_df, prefix_sep="_", drop_first=True)

In [15]:
housing_df.columns

Index(['TOTAL_VALUE', 'TAX', 'LOT_SQFT', 'YR_BUILT', 'GROSS_AREA',
       'LIVING_AREA', 'FLOORS', 'ROOMS', 'BEDROOMS', 'FULL_BATH', 'HALF_BATH',
       'KITCHEN', 'FIREPLACE', 'REMODEL_Old', 'REMODEL_Recent'],
      dtype='object')

In [16]:
housing_df.loc[:, "REMODEL_Old":"REMODEL_Recent"].head(2)

Unnamed: 0,REMODEL_Old,REMODEL_Recent
0,0,0
1,0,1


In [24]:
#The median is used for imputation, rather than the mean, to preserve the
#integer nature of the counts for bedrooms.

In [17]:
# To illustrate missing data procedures, we first convert a few entries for
# bedrooms to NA’s. Then we impute these missing values using the median of the
# remaining values.
missingRows = housing_df.sample(10).index
missingRows

Int64Index([5762, 2614, 2875, 5401, 2131, 5416, 5225, 1530, 1377, 382], dtype='int64')

In [18]:
housing_df.loc[missingRows, "BEDROOMS"] = np.nan  # removing the values for experiment

In [19]:
print("Number of rows with valid BEDROOMS values after setting to NAN: ",housing_df["BEDROOMS"].count())

Number of rows with valid BEDROOMS values after setting to NAN:  5792


In [20]:
# remove rows with missing values
reduced_df = housing_df.dropna()

In [21]:
print("Number of rows after removing rows with missing values: ",len(reduced_df))

Number of rows after removing rows with missing values:  5792


In [22]:
# replace the missing values using the median of the remaining values.
medianBedrooms = housing_df["BEDROOMS"].median()

In [23]:
medianBedrooms

3.0

In [24]:
housing_df.BEDROOMS = housing_df.BEDROOMS.fillna(value=medianBedrooms)

In [25]:
print("Number of rows with valid BEDROOMS values after filling NA values: ",housing_df["BEDROOMS"].count())

Number of rows with valid BEDROOMS values after filling NA values:  5802


In [None]:
# pp64

In [None]:
Normalizing (Standardizing) and Rescaling Data. This operation is also sometimes called standardizing.

In [None]:
using the methods mean and std; (df - df.mean()) / df.std(). In effect, we
are expressing each value as the “number of standard deviations away from the mean,”
also called a z-score. An alternative is the class StandardScaler(), which is one of a
number different transformers available in scikit-learn. You can use the methods fit() or
fit_transform() to train the transformer on the training set and the method transform()
to apply on the validation set. The result of the transformation is no longer a pandas
dataframe, however, you can convert it back into one easily.

In [27]:
#housing_df.mean()

In [28]:
#housing_df.std()

In [24]:
#housing_df...Now take out the last column...error in " norm_df = (housing_df - housing_df.mean()) / housing_df.std()""

In [29]:
pp = housing_df.iloc[:,0:13]

In [30]:
pp.head(2)

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,344.2,4330,9965,1880,2436,1352,2.0,6,3.0,1,1,1,0
1,412.6,5190,6590,1945,3108,1976,2.0,10,4.0,2,1,1,0


In [31]:
house = (pp - pp.mean())/ pp.std()

In [32]:
house.head(2)

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,-0.488879,-0.488507,1.381019,-1.57669,-0.552998,-0.564458,0.710905,-0.69198,-0.271728,-0.568528,0.723202,-0.124803,-1.309337
1,0.200795,0.200789,0.116835,0.229372,0.207196,0.590121,0.710905,2.090325,0.910104,1.347035,0.723202,-0.124803,-1.309337


In [None]:
Normalizing is one way to bring all variables to the same scale. Another popular approach
is rescaling each variable to a [0, 1] scale. This is done by subtracting the minimum value
and then dividing by the range. Subtracting the minimum shifts the variable origin to
zero. Dividing by the range shrinks or expands the data to the range [0, 1]. In pandas, use
the expression (df-df.min())/ (df.max()-df.min()).

In [33]:
norm_df = (pp - pp.min()) / (pp.max() - pp.min())

In [4]:
#norm_df.head()

In [26]:
# refer pp65 for scikit-learn:

In [None]:
""" To consider why normalizing or scaling to [0, 1] might be necessary, consider the case of
clustering. Clustering typically involves calculating a distance measure that reflects how
far each record is from a cluster center or from other records. With multiple variables,
different units will be used: days, dollars, counts, and so on. If the dollars are in the
thousands and everything else is in the tens, the dollar variable will come to dominate the
distance measure. Moreover, changing units from, say, days to hours or months, could
alter the outcome completely."""

In [None]:
2.5 Predictive Power and Overfitting

In [None]:
In supervised learning, a key question presents itself: How well will our prediction or
classification model perform when we apply it to new data? We are particularly interested
in comparing the performance of various models so that we can choose the one we think
will do the best when it is implemented in practice. A key concept is to make sure that our
chosen model generalizes beyond the dataset that we have at hand. To assure
generalization, we use the concept of data partitioning and try to avoid overfitting. These
two important concepts are described next.

In [None]:
#overfitting ---Overfitting: This function fits the data with no error see pp67

In [None]:
Somewhat surprisingly, even if we know for a fact that a higher-degree curve is the
appropriate model, if the model-fitting dataset is not large enough, a lower-degree
function (that is not as likely to fit the noise) is likely to perform better in terms of
predicting new values. Overfitting can also result from the application of many different
models, from which the best performing model is selected.

In [None]:
Creation and Use of Data Partitions

In [None]:
Training Partition
The training partition, typically the largest partition, contains the data used to build the
various models we are examining. The same training partition is generally used to
develop multiple models.
Validation Partition
The validation partition (sometimes called the test partition) is used to assess the
predictive performance of each model so that you can compare models and choose the
best one. In some algorithms (e.g., classification and regression trees, k-nearest
neighbors), the validation partition may be used in an automated fashion to tune and
improve the model.
Test Partition
The test partition (sometimes called the holdout or evaluation partition) is used to assess
the performance of the chosen model with new data.
Why have both a validation and a test partition? When we use the validation data to
assess multiple models and then choose the model that performs best with the validation
data, we again encounter another (lesser) facet of the overfitting problem—chance aspects
of the validation data that happen to match the chosen model better than they match
other models. In other words, by using the validation data to choose one of several
models, the performance of the chosen model on the validation data will be overly
optimistic.
The random features of the validation data that enhance the apparent performance of the
chosen model will probably not be present in new data to which the model is applied.
Therefore, we may have overestimated the accuracy of our model.

In [34]:
trainData, validData = train_test_split(housing_df, test_size=0.40,random_state=1)

In [None]:
What is train_test_split?
train_test_split is a function in Sklearn model selection for splitting data arrays into two subsets: 
for training data and for testing data. With this function, you don't need to divide the dataset manually.
By default, Sklearn train_test_split will make random partitions for the two subsets. 
However, you can also specify a random state for the operation.

In [None]:
train_test_split(X, y, train_size=0.*,test_size=0.*, random_state=*)
X, y. The first parameter is the dataset you're selecting to use.
train_size. This parameter sets the size of the training dataset. There are three options: None, which is the default, Int, which requires the exact number of samples, and float, which ranges from 0.1 to 1.0.
test_size. This parameter specifies the size of the testing dataset. The default state suits the training size. It will be set to 0.25 if the training size is set to default.
random_state. The default mode performs a random split using np.random. Alternatively, you can add an integer using an exact number.

In [35]:
import sklearn.model_selection as model_selection

In [36]:
import sklearn.model_selection as model_selection
trainData, validData = model_selection.train_test_split(housing_df, test_size=0.40,random_state=1)

In [37]:
print("Training : ", trainData.shape)
print("Validation : ", validData.shape)
print()

Training :  (3481, 15)
Validation :  (2321, 15)



In [None]:
# training (50; and then splitting validation 40% and test 10%

In [38]:
# training (50
trainData, temp = model_selection.train_test_split(housing_df, test_size=0.5, random_state=1)

In [39]:
validData, testData = model_selection.train_test_split(temp, test_size=0.4, random_state=1)

In [40]:
print("Training : ", trainData.shape)
print("Test : ", testData.shape)

Training :  (2901, 15)
Test :  (1161, 15)


In [None]:
Cross-Validation
When the number of records in our sample is small, data partitioning might not be
advisable as each partition will contain too few records for model building and
performance evaluation. Furthermore, some data mining methods are sensitive to small
changes in the training data, so that a different partitioning can lead to different results.
An alternative to data partitioning is cross-validation, which is especially useful with
small samples. Cross-validation, or k-fold cross-validation, is a procedure that starts with
partitioning the data into “folds,” or non-overlapping subsamples. Often we choose k = 5
folds, meaning that the data are randomly partitioned into five equal parts, where each
fold has 20% of the observations. A model is then fit k times. Each time, one of the folds
is used as the validation set and the remaining k − 1 folds serve as the training set. The
result is that each fold is used once as the validation set, thereby producing predictions
for every observation in the dataset. We can then combine the model’s predictions on
each of the k validation sets in order to evaluate the overall performance of the model. In
Python, cross-validation is achieved using the cross_val_score() or the more general
cross_validate function, where argument cv determines the number of folds. Sometimes
cross-validation is built into a data mining algorithm, with the results of the crossvalidation
used for choosing the algorithm’s parameters