Create a stored procedure that trains the model

In [1]:
DROP PROCEDURE IF EXISTS generate_model;
GO

-- generate_model
--
-- Trains a scikit-learn LinearRegression on HDB resale data inside SQL Server
-- (sp_execute_external_script, Python) and hands the fitted model back pickled.
--
-- Parameters:
--   @trained_model (OUTPUT)  receives pickle.dumps(<fitted LinearRegression>).
--
-- Training data: hdbAddress joined to remainingLease and resalePrices on id,
-- narrowed to town code 0 and year 2017 inside the script.
--
-- NOTE(review): the categorical codes come from pandas.factorize, which numbers
-- values in order of first appearance. The input query has no ORDER BY, so which
-- town ends up as code 0 depends on the row order SQL Server happens to return.
-- Confirm this is acceptable before relying on the town filter.
CREATE PROCEDURE generate_model (@trained_model varbinary(max) OUTPUT)
AS
BEGIN
    EXECUTE sp_execute_external_script
            @language = N'Python',
            @script = N'
import pandas
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle

raw_dataframe = input_data

# Data pre-processing: replace the categorical text columns with integer codes
raw_dataframe.town = pandas.factorize(raw_dataframe.town)[0]
raw_dataframe.flat_type = pandas.factorize(raw_dataframe.flat_type)[0]
raw_dataframe.flat_model = pandas.factorize(raw_dataframe.flat_model)[0]

# Data range refinement: keep a single town code and a single year
town_value = 0
year_value = 2017
refined_df = raw_dataframe.loc[(raw_dataframe[''town''] == town_value) &
                               (raw_dataframe[''year''] == year_value)]

# Fail with a readable message instead of an opaque sklearn error
if refined_df.empty:
    raise ValueError(''No rows match town code {} and year {}; cannot train.''.format(town_value, year_value))

# Define dependent and independent variables
independent_variables = [''town'',''flat_type'',''floor_area_sqm'',''flat_model'',''remaining_lease_months''] #input any number of variables
dependent_variable = [''resale_price''] #only input one variable

# Split data into dependent and independent
independent_dataset = refined_df[independent_variables]
dependent_dataset = refined_df[dependent_variable]

# Hold back 20% of the rows; only the training part is used below.
# The split is random, so each run may fit a slightly different model.
test_ratio = 0.2
indp_train_set, indp_test_set, dep_train_set, dep_test_set = train_test_split(independent_dataset,
                                                                              dependent_dataset,
                                                                              test_size=test_ratio)

# Create model object & train model
linear_model = LinearRegression()
linear_model.fit(indp_train_set, dep_train_set)

# Serialise the fitted model so it can travel back as varbinary(max)
trained_model = pickle.dumps(linear_model)
'
, @input_data_1 = N'
SELECT
    addr.[id], addr.[year], addr.[month], addr.[date],
    addr.[town], addr.[flat_type], addr.[block], addr.[street_name],
    addr.[storey_range], addr.[floor_area_sqm], addr.[flat_model],
    lease.[lease_commence_date], lease.[remaining_lease], lease.[remaining_lease_months],
    price.[resale_price]
FROM [dbo].[hdbAddress] AS addr
INNER JOIN [dbo].[remainingLease] AS lease
    ON addr.[id] = lease.[id]
INNER JOIN [hdb_database_1].[resalePrices] AS price
    ON addr.[id] = price.[id]'
, @input_data_1_name = N'input_data'
, @params = N'@trained_model varbinary(max) OUTPUT'
, @trained_model = @trained_model OUTPUT;
END;
GO

Create a table to store trained models

In [2]:
-- (Re)creates the table that stores pickled models, one row per model name.
-- Dropping the table discards every model stored so far.
USE externalDB;
DROP TABLE IF EXISTS dbo.hdb_py_models;
GO
CREATE TABLE dbo.hdb_py_models (
    -- Lookup key for a stored model; named constraints keep errors and
    -- later migrations greppable instead of relying on generated names.
    model_name VARCHAR(30) NOT NULL
        CONSTRAINT df_hdb_py_models_model_name DEFAULT ('default model'),
    -- Serialized model bytes
    model VARBINARY(MAX) NOT NULL,
    CONSTRAINT pk_hdb_py_models PRIMARY KEY (model_name)
);
GO

Execute the stored procedure and store the resultant model into the table

In [3]:
-- First-time training: run this only while no 'linear_model' row exists yet
DECLARE @fitted_model VARBINARY(MAX);

EXECUTE generate_model @trained_model = @fitted_model OUTPUT;

-- Store the freshly trained model under its lookup name
INSERT INTO [dbo].[hdb_py_models] (model_name, model)
SELECT 'linear_model', @fitted_model;

In [None]:
-- Execute if the model has already been created: retrain and overwrite the
-- stored 'linear_model' row in place.
DECLARE @model VARBINARY(MAX);
-- The training procedure defined in this notebook is generate_model.
EXECUTE generate_model @model OUTPUT;

-- Only the model bytes change; the name is the key being matched.
UPDATE [dbo].[hdb_py_models]
SET model = @model
WHERE model_name = 'linear_model';

Retrieve the model from the table

In [4]:
-- Fetch the serialized model stored under the name 'linear_model'
SELECT stored.model
FROM dbo.hdb_py_models AS stored
WHERE stored.model_name = 'linear_model';

model
0x800363736B6C6561726E2E6C696E6561725F6D6F64656C2E626173650A4C696E65617252656772657373696F6E0A7100298171017D710228580D0000006669745F696E7465726365707471038858090000006E6F726D616C697A657104895806000000636F70795F5871058858060000006E5F6A6F627371064E5805000000636F65665F7107636E756D70792E636F72652E6D756C746961727261790A5F7265636F6E7374727563740A7108636E756D70790A6E6461727261790A71094B0085710A430162710B87710C52710D284B014B014B0286710E636E756D70790A64747970650A710F5802000000663871104B004B01877111527112284B0358010000003C71134E4E4E4AFFFFFFFF4AFFFFFFFF4B0074711462894310BAA5C2AA4ACDCB40B0B72649DDA8654071157471166258090000005F72657369647565737117680868094B00857118680B87711952711A284B014B0185711B68128943084EF8D10316B0A642711C74711D62580500000072616E6B5F711E4B02580900000073696E67756C61725F711F680868094B00857120680B877121527122284B014B02857123681289431043149303F8EF04417978932A93065140712474712562580A000000696E746572636570745F7126680868094B00857127680B877128527129284B014B0185712A6812894308D024E6C075AC15C1712B74712C6258100000005F736B6C6561726E5F76657273696F6E712D5806000000302E32302E32712E75622E


Create a stored procedure that retrieves the stored model and predicts a set of values

In [8]:
DROP PROCEDURE IF EXISTS py_predict_hdb;
GO

-- py_predict_hdb
--
-- Loads a pickled model from dbo.hdb_py_models and returns its predictions for
-- the rows of the HDB data set that match town code 0 and year 2017.
--
-- Parameters:
--   @model  name of the stored model (dbo.hdb_py_models.model_name).
--
-- Result set:
--   predicted_value INT NOT NULL, one row per input row that passes the filter.
--   The model output is floating point; it is converted to INT by the
--   WITH RESULT SETS clause.
--
-- Errors:
--   50001 when no model is stored under @model.
--   A Python ValueError when the stored model was fitted on a different number
--   of features than this procedure supplies.
--
-- NOTE(review): the categorical codes come from pandas.factorize, which numbers
-- values in order of first appearance. The input query has no ORDER BY, so the
-- codes only match the ones used for training if both queries return rows in
-- the same order. Confirm before trusting the predictions.
CREATE PROCEDURE py_predict_hdb (@model varchar(100))
AS
BEGIN
    DECLARE @py_model varbinary(max);

    SELECT @py_model = model
    FROM dbo.hdb_py_models
    WHERE model_name = @model;

    -- Without this guard a missing model reaches pickle.loads as None and
    -- surfaces only as a generic external script error.
    IF @py_model IS NULL
    BEGIN
        DECLARE @missing_model_message nvarchar(400) =
            CONCAT(N'py_predict_hdb: no model named "', @model, N'" exists in dbo.hdb_py_models.');
        THROW 50001, @missing_model_message, 1;
    END;

    EXECUTE sp_execute_external_script
        @language = N'Python',
        @script = N'
import pandas
import pickle

raw_dataframe = input_data

# SECURITY NOTE: pickle.loads can run arbitrary code contained in the payload.
# Only load models from a table whose writers are trusted.
trained_model = pickle.loads(py_model)

# Data pre-processing: replace the categorical text columns with integer codes
raw_dataframe.town = pandas.factorize(raw_dataframe.town)[0]
raw_dataframe.flat_type = pandas.factorize(raw_dataframe.flat_type)[0]
raw_dataframe.flat_model = pandas.factorize(raw_dataframe.flat_model)[0]

# Data range refinement: keep a single town code and a single year
town_value = 0
year_value = 2017
refined_df = raw_dataframe.loc[(raw_dataframe[''town''] == town_value) &
                               (raw_dataframe[''year''] == year_value)]

# Feature columns handed to the model, in this order
independent_variables = [''town'',''flat_type'',''floor_area_sqm'',''flat_model'',''remaining_lease_months''] #input any number of variables
independent_dataset = refined_df[independent_variables]

# A model fitted on a different feature list fails inside predict with an
# obscure shape error; report the mismatch explicitly instead.
coefficients = getattr(trained_model, ''coef_'', None)
if coefficients is not None and coefficients.shape[-1] != len(independent_variables):
    raise ValueError(''Stored model expects {} feature(s) but {} are supplied; retrain it with generate_model.''.format(coefficients.shape[-1], len(independent_variables)))

linear_predictions = trained_model.predict(independent_dataset)

# Single named column, matching the WITH RESULT SETS definition
OutputDataSet = pandas.DataFrame(linear_predictions, columns=[''predicted_value''])
'
, @input_data_1 = N'
SELECT
    addr.[id], addr.[year], addr.[month], addr.[date],
    addr.[town], addr.[flat_type], addr.[block], addr.[street_name],
    addr.[storey_range], addr.[floor_area_sqm], addr.[flat_model],
    lease.[lease_commence_date], lease.[remaining_lease], lease.[remaining_lease_months],
    price.[resale_price]
FROM [dbo].[hdbAddress] AS addr
INNER JOIN [dbo].[remainingLease] AS lease
    ON addr.[id] = lease.[id]
INNER JOIN [hdb_database_1].[resalePrices] AS price
    ON addr.[id] = price.[id] '
, @input_data_1_name = N'input_data'
, @params = N'@py_model varbinary(max)'
, @py_model = @py_model
WITH RESULT SETS ( ([predicted_value] INT NOT NULL) );

END;
GO

In [9]:
-- (Re)creates the table that receives the rows returned by py_predict_hdb.
DROP TABLE IF EXISTS [dbo].[py_hdb_predictions];
GO

CREATE TABLE [dbo].[py_hdb_predictions] (
    -- Surrogate key; records the order in which predictions were inserted
    [ID] [INT] IDENTITY(1,1) NOT NULL,
    -- Holds the predicted_value column produced by py_predict_hdb
    [predicted_hdb_count] [INT] NOT NULL,
    CONSTRAINT [pk_py_hdb_predictions] PRIMARY KEY ([ID])
) ON [PRIMARY];
GO

In [10]:
-- Store the predictions of the 'linear_model' model.
-- The explicit column list keeps the insert valid if the table gains columns;
-- ID is an IDENTITY column and is generated automatically.
INSERT INTO [dbo].[py_hdb_predictions] ([predicted_hdb_count])
EXEC py_predict_hdb 'linear_model';

: Msg 39004, Level 16, State 20, Line 2
A 'Python' script error occurred during execution of 'sp_execute_external_script' with HRESULT 0x80004004.

: Msg 39019, Level 16, State 2, Line 2
An external script error occurred: 

Error in execution.  Check the output for more information.

In [None]:
-- List the stored predictions in insertion order.
-- The table column is named predicted_hdb_count; it is aliased so the result
-- keeps the predicted_value heading.
SELECT pred.[predicted_hdb_count] AS [predicted_value]
FROM [dbo].[py_hdb_predictions] AS pred
ORDER BY pred.[ID];


In [None]:
-- Recorded resale prices for ANG MO KIO in 2017, ordered by id
-- (presumably the reference values to compare the predictions against).
SELECT
    addr.[id],
    addr.[town],
    addr.[year],
    price.[resale_price]
FROM [dbo].[hdbAddress] AS addr
INNER JOIN [hdb_database_1].[resalePrices] AS price
    ON price.[id] = addr.[id]
WHERE addr.[town] = 'ANG MO KIO'
  AND addr.[year] = 2017
ORDER BY addr.[id];