In [3]:
# Extra flag interpolated into the pip install commands of the next cell;
# "--user" makes pip install into the per-user site-packages directory.
USER_FLAG = '--user'

In [4]:
!pip3 install {USER_FLAG} google-cloud-aiplatform==1.7.0
!pip3 install {USER_FLAG} kfp==1.8.9



In [2]:
import os

# Restart the kernel so the packages installed above become importable.
# The restart is skipped whenever the IS_TESTING environment variable is set
# to a non-empty value.
if not os.environ.get("IS_TESTING"):
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)

In [None]:
# Print the version of the KFP SDK that `python3` on the shell PATH can import.
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"

In [2]:
# List the installed packages and keep only the lines mentioning "aiplatform".
!pip list | grep aiplatform

google-cloud-aiplatform               1.7.0


In [3]:
import os

In [39]:
# Google Cloud project that owns the pipeline resources.
PROJECT_ID = "mlopts-clustering"
# Cloud Storage bucket URI derived from the project id.
BUCKET_NAME = f"gs://{PROJECT_ID}-bucket"

In [40]:
import matplotlib.pyplot as plt
import pandas as pd

from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath

from google.cloud import aiplatform

# We'll use this namespace for metadata querying
from google.cloud import aiplatform_v1

In [42]:
# Make sure the user-level bin directory is on PATH.
# The entry is appended only when it is missing, so re-running this cell no
# longer grows PATH with duplicate entries on every execution.
LOCAL_BIN = "/home/jupyter/.local/bin"
PATH = os.environ["PATH"]  # PATH as it was before this cell touched it
if LOCAL_BIN not in PATH.split(os.pathsep):
    os.environ["PATH"] = f"{PATH}{os.pathsep}{LOCAL_BIN}"

# Region used for the Vertex AI resources.
REGION = "us-west1"

# Cloud Storage location under which pipeline artifacts are written.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin


'gs://mlopts-clustering-bucket/pipeline_root/'

In [43]:
import pandas as pd
import numpy as np
# from google.colab import files
import matplotlib.pyplot as plt
import io
import requests
%matplotlib inline
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from scipy import stats
# from google.colab import drive
import csv  
import seaborn as sns

In [45]:
#Create a Component to Load Base data
@component(
    # Only pandas is used by this step. The previous list also named
    # "requests" and "bs4" (unused here) and "sklearn", a deprecated PyPI
    # placeholder whose installation now fails and would break the step
    # before any code runs.
    packages_to_install=["pandas"],
    base_image="python:3.9",
    output_component_file="create_dataset.yaml"
)
def load_data(base_ds: Output[Dataset], school_ds: Output[Dataset]):
    """Download the two source CSV files and store them as pipeline datasets.

    Args:
        base_ds: Output dataset that receives the property listings CSV.
        school_ds: Output dataset that receives the school CSV.
    """
    import pandas as pd

    base_dataset = pd.read_csv('https://storage.googleapis.com/mlpps-clustering-bingecoders/data/Dataset_TX_1.csv')
    school_dataset = pd.read_csv('https://storage.googleapis.com/mlpps-clustering-bingecoders/data/school_dataset.csv')

    # The DataFrame index is written as the first CSV column (pandas default),
    # exactly as before, so downstream readers see the same file layout.
    base_dataset.to_csv(base_ds.path)
    school_dataset.to_csv(school_ds.path)

In [66]:
#Create component for data preparation
@component(
    packages_to_install=["pandas", "requests", "bs4"],
    base_image="python:3.9",
    output_component_file="prepare_dataset.yaml"
)
def prepare_data(bd: Input[Dataset], sd: Input[Dataset], updated_dataset: Output[Dataset]):
    """Clean the base listings, add scraped listings and attach a school rank.

    Steps:
      1. Clean the base listings: drop unused columns, keep only rows whose
         address state is 'TX', derive ``zipcode`` from the address and cast
         the numeric columns.
      2. Sum the school enrollment per zip code and turn it into a rank
         (1 = zip code with the largest total).
      3. Scrape three Zillow result pages and parse price, bedrooms,
         bathrooms, area, listing type and zip code from each listing card.
      4. Concatenate base and scraped listings, compute ``price_per_sqft``,
         keep rows below 600, left-join the school rank on ``zipcode`` and
         drop rows without a rank.

    Args:
        bd: Input dataset with the base property listings (CSV).
        sd: Input dataset with the school data (CSV).
        updated_dataset: Output dataset that receives the merged CSV.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container.
    import csv
    import io

    import bs4
    import pandas as pd
    import requests

    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    LISTING_COLUMNS = ['price', 'bed', 'bath', 'sqft', 'listing_type', 'address']
    # Without a timeout a stalled connection would block the step forever.
    REQUEST_TIMEOUT_SECONDS = 30
    # (url, referer) pairs of the result pages to scrape.
    LISTING_PAGES = [
        (
            'https://www.zillow.com/tx/house,land,townhouse_type/',
            'https://www.zillow.com/tx/house,land,townhouse_type/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-122.9723493125%2C%22east%22%3A-77.1813336875%2C%22south%22%3A26.21832355280932%2C%22north%22%3A36.15897703463119%7D%2C%22mapZoom%22%3A5%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A54%2C%22regionType%22%3A2%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22mf%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D',
        ),
        ('https://www.zillow.com/tx/townhomes/?', 'https://www.zillow.com/tx/townhomes/?'),
        ('https://www.zillow.com/tx/condos/?', 'https://www.zillow.com/tx/condos/?'),
    ]

    def _zip_from_address(text):
        """Return the token after the state in 'street, city, ST zip', or None if malformed."""
        try:
            return text.split(',')[2].split()[1]
        except IndexError:
            return None

    def _parse_heading(text):
        """Split one listing-card heading into [price, bed, bath, sqft, listing_type].

        The parsing is purely positional: the text before the first '-' is
        split on whitespace, the last character of the first token is the
        bedroom count and the rest of it the price, the last character of the
        second token is the bathroom count, and the third token minus its
        first two characters is the area. Returns None when the heading does
        not have that shape.
        """
        parts = text.split('-')
        tokens = parts[0].split()
        try:
            price = tokens[0][:-1].replace(',', '')
            bed = tokens[0][-1]
            bath = tokens[1][-1]
            sqft = tokens[2][2:].replace(',', '')
            listing_type = parts[1]
        except IndexError:
            return None
        return [price, bed, bath, sqft, listing_type]

    def _scrape_listings(url, referer):
        """Fetch one result page and return its listing cards as a DataFrame."""
        page = requests.get(
            url=url,
            headers={'user-agent': USER_AGENT, 'referer': referer},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        soup = bs4.BeautifulSoup(page.content, 'html.parser')

        headings = [
            card.text
            for card in soup.find_all('div', {'class': 'list-card-heading'})
            if len(card) != 0
        ]
        zipcodes = [
            _zip_from_address(tag.text)
            for tag in soup.find_all('address', {'class': 'list-card-addr'})
        ]

        # NOTE(review): headings and addresses are paired by position, so an
        # empty heading card (filtered out above) would shift the pairing -
        # confirm against the page markup.
        # The rows go through an in-memory CSV so pandas infers column types
        # exactly as it did when they were written to and re-read from disk.
        buffer = io.StringIO()
        writer = csv.writer(buffer)
        writer.writerow(LISTING_COLUMNS)
        skipped = 0
        # zip() stops at the shorter list instead of raising IndexError when
        # the page yields fewer addresses than headings.
        for heading, zipcode in zip(headings, zipcodes):
            fields = _parse_heading(heading)
            if fields is None or zipcode is None:
                skipped += 1
                continue
            writer.writerow(fields + [zipcode])
        if skipped:
            print(f'{url}: skipped {skipped} listing card(s) that could not be parsed')
        buffer.seek(0)
        return pd.read_csv(buffer)

    base_dataset = pd.read_csv(bd.path)
    school_dataset = pd.read_csv(sd.path)

    # ---- 1. base listings ---------------------------------------------------
    base_dataset = base_dataset.drop(
        ['latitude', 'longitude', 'currency', 'land_area', 'rent_zestimate', 'days_on_zillow',
         'sold_date', 'is_zillow_owned', 'image', 'broker_name', 'input', 'property_url',
         'listing_url', 'property_id', 'rank'],
        axis='columns',
    )
    # Third comma-separated part of the address, split into [state, zipcode].
    state_and_zip = base_dataset['address'].str.split(',').str[2].str.split()
    base_dataset = (
        base_dataset
        .assign(zipcode=state_and_zip.str[1], state=state_and_zip.str[0])
        .loc[lambda df: df['state'] == 'TX']
        .drop(columns=['address', 'state'])
        # keep only the first whitespace-separated token of the area text
        .assign(area=lambda df: df['area'].str.split().str[0])
        .dropna(subset=['bedrooms', 'bathrooms', 'price', 'area'])
        .astype({'zipcode': int, 'bathrooms': int, 'bedrooms': int, 'price': int, 'area': float})
        # area goes through float first, then is truncated to int
        .astype({'area': int})
    )

    # ---- 2. school rank per zip code ---------------------------------------
    school_data = (
        # The column selection plus rename/assign builds a new frame, so
        # nothing is written onto a slice of school_dataset.
        school_dataset[['School Zip', 'School Enrollment as of Oct 2020']]
        .rename(columns={"School Zip": "zipcode", "School Enrollment as of Oct 2020": "ranking"})
        # keep the part before the first '-' of the zip value
        .assign(zipcode=lambda df: df['zipcode'].str.split('-').str[0])
    )
    df_school = (
        school_data
        .groupby('zipcode', as_index=False)['ranking'].sum()
        .sort_values('ranking', ascending=False)
        .assign(school_rank=lambda df: range(1, 1 + len(df)))
        .drop(columns=['ranking'])
        .astype({'school_rank': int, 'zipcode': int})
    )

    # ---- 3. scraped listings -----------------------------------------------
    scraped_data = pd.concat(
        [_scrape_listings(url, referer) for url, referer in LISTING_PAGES],
        axis=0,
    ).reset_index(drop=True)
    # Scraped prices still carry the leading '$'.
    scraped_data['price'] = scraped_data['price'].str.split('$').str[1].astype(int)
    scraped_data = scraped_data.rename(
        columns={"bed": "bedrooms", "bath": "bathrooms", "sqft": "area", "address": "zipcode"}
    )

    # ---- 4. combine, filter and join ---------------------------------------
    base_scraped = pd.concat([base_dataset, scraped_data], axis=0)
    base_scraped['price_per_sqft'] = base_scraped['price'] / base_scraped['area']
    # drop=False keeps the previous index as an 'index' column, as before.
    base_scraped = base_scraped.reset_index(drop=False)
    base_scraped = base_scraped[base_scraped.price_per_sqft < 600]

    merged_df = pd.merge(base_scraped, df_school, on='zipcode', how="left")
    # Rows whose zip code has no school data cannot be used downstream.
    merged_df = merged_df.dropna(subset=['school_rank'])

    merged_df.to_csv(updated_dataset.path)
    

# "scikit-learn" is the installable distribution name; the former "sklearn"
# entry was a deprecated PyPI placeholder whose installation now fails.
@component(packages_to_install=["pandas", "scikit-learn", "joblib"],
    base_image="python:3.9",
    output_component_file="train_model.yaml")
def train_model(
    dataset: Input[Dataset],
    model_sklearn: Output[Model]
):
    """Fit a linear regression of price on school rank and area and save it.

    Args:
        dataset: Input dataset (CSV) providing the ``school_rank``, ``area``
            and ``price`` columns.
        model_sklearn: Output model artifact; the fitted estimator is written
            next to it as ``model.joblib``.
    """
    import pandas as pd
    from joblib import dump
    from sklearn.linear_model import LinearRegression

    reg_data = pd.read_csv(dataset.path)
    reg_algo = LinearRegression()
    reg_algo.fit(reg_data[['school_rank', 'area']], reg_data.price)

    model_sklearn.metadata["framework"] = "Linear Regression"

    # Swap the artifact name for "model" so the dumped file is "model.joblib"
    # inside the artifact's parent directory.
    # NOTE(review): the deploy step serves this file with an sklearn 0.24
    # prediction image while this step installs an unpinned scikit-learn -
    # confirm the two versions are compatible.
    path = model_sklearn.path.replace("model_sklearn", "model")
    dump(reg_algo, path + ".joblib")

In [67]:
@component(
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="beans_deploy_component.yaml",
)
def deploy_model(
    model: Input[Model],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Args:
        model: Input model artifact produced by the training step.
        vertex_endpoint: Output artifact whose URI is set to the resource name
            of the endpoint returned by the deployment.
        vertex_model: Output model artifact whose URI is set to the resource
            name of the uploaded model.
    """
    from google.cloud import aiplatform

    aiplatform.init(project='mlopts-clustering', location="us-west1")

    # Removing the artifact name from the URI leaves the directory that
    # contains it; that directory is handed to the upload call.
    artifact_dir = model.uri.replace("model_sklearn", "")
    uploaded_model = aiplatform.Model.upload(
        display_name="property-model-pipeline",
        artifact_uri=artifact_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest",
    )
    serving_endpoint = uploaded_model.deploy(machine_type="n1-standard-4")

    # Expose the created resources through the output artifacts.
    vertex_endpoint.uri = serving_endpoint.resource_name
    vertex_model.uri = uploaded_model.resource_name

In [68]:
# Pipeline wiring: load -> prepare -> train -> deploy.
# The pipeline name is used to determine the pipeline Context.
@dsl.pipeline(
    name="pipeline-test-1",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline():
    load_task = load_data()
    prepare_task = prepare_data(
        bd=load_task.outputs["base_ds"],
        sd=load_task.outputs["school_ds"],
    )
    train_task = train_model(dataset=prepare_task.outputs["updated_dataset"])
    deploy_model(model=train_task.outputs["model_sklearn"])

In [69]:
# Compile the pipeline function into the job spec file that is submitted below.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='property_pipe.json',
)

In [70]:
from kfp.v2.google.client import AIPlatformClient

# Reuse the notebook-level PROJECT_ID / REGION constants instead of re-typing
# the literals, so the client cannot drift from the rest of the configuration.
api_client = AIPlatformClient(
    project_id=PROJECT_ID,
    region=REGION,
)

# Submit the compiled job spec as a pipeline run.
response = api_client.create_run_from_job_spec(
    'property_pipe.json',
)