In [1]:
#Project: Import json to bigquery table
#StartDate: 4/21/2022
#EndDate: 
#Developer: Bradley, Hongquy, Khoa

In [2]:
from flask import Flask,render_template,request,redirect,url_for
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import storage
import os, re, datetime, uuid, time, json
import google.cloud.logging
import apache_beam as beam
from io import StringIO
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from werkzeug.utils import secure_filename

In [3]:
#Path of the service-account key used to authenticate against BigQuery and Cloud Storage.
#A GOOGLE_APPLICATION_CREDENTIALS value that is already exported wins, so the notebook is not tied to
#one workstation; when it is missing or empty the original developer path is used as the fallback.
Key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or r"C:\\Users\Brad\Documents\model-craft-342921-ea36cdb339e7.json"

#exported (again) so that clients created without explicit credentials, such as the logging client
#at the bottom of this cell, can pick the same key up from the environment
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = Key_path
credentials = service_account.Credentials.from_service_account_file(Key_path,scopes=["https://www.googleapis.com/auth/cloud-platform"])

#BigQuery client, bound to the project named inside the key file
client=bigquery.Client(credentials=credentials,project=credentials.project_id)

#Cloud Storage client, using the same credentials and project
storage_client=storage.Client(credentials=credentials,project=credentials.project_id)

#this is where the uploaded file is stored temporarily so that the cloud job can read it
bucket = storage_client.get_bucket("data_intake4062022")

#this is where the column location and message from bigquery are stored to be accessed later
bucket1 = storage_client.get_bucket("error_messages4142022")

#this is where the rows that failed to be inserted into bigquery are stored to be accessed later
bucket2 = storage_client.get_bucket("practice_error_logs")

#NOTE(review): not referenced by the code visible in this notebook; going by the bucket name it
#presumably holds per-job details - confirm
bucket3 = storage_client.get_bucket("job_details_4192022")

#this is needed to access the Dataflow worker logs of the bigquery job
client1 = google.cloud.logging.Client()
logger = client1.logger(name="dataflow.googleapis.com%2Fworker")

In [4]:
#DoFn that converts one positional row into a dictionary keyed by column name, ready for a bigquery insert
class formating(beam.DoFn):
    """Label the values of a positional row with the column names of the target table."""

    #table_sch is a side input: the list of column descriptions of the table being written to
    def process(self, element, table_sch, job_name):
        """Yield one dict for ``element``.

        The dict starts with ``job_id`` set to ``job_name``; afterwards the value at
        position ``i`` of ``element`` is stored under ``table_sch[i]['name']``.
        """
        import apache_beam as beam
        row = {'job_id': job_name}
        #pairs are produced in positional order, so a later column with the same name overwrites an earlier one
        row.update(
            (table_sch[position]['name'], element[position])
            for position in range(len(element))
        )
        yield row

In [5]:
#this function is for the second pipeline and will merge the message from bigquery with its respective input data
class process_message(beam.DoFn):
    """Pair every error reported for a failed insert with the input row that caused it."""

    #bad_input_data is a side input that contains all the bad inserts from bigquery
    #count is a side input: the number the error messages continue counting from
    def process(self, element, bad_input_data, count):
        """Yield one formatted error line per (error, input row) match.

        element: text of a document whose ``'elements'`` list holds entries with a
            ``'location'`` (column name) and a ``'message'``.
        bad_input_data: mapping whose ``'elements'`` list holds the rejected input rows;
            matched rows are removed from that list so a row is reported only once.
        count: starting value for the error numbering; the increments below are local
            to this call and are not written back to the caller.

        A row matches when its value in the reported column equals the text between the
        first and second ``': '`` of the message. A message without ``': '`` raises
        ``IndexError`` as soon as it is compared with a row.
        """
        #imported inside the method like in the original, presumably so the name also resolves on the workers
        import json

        #accept real JSON as it is; the quote replacement is only a fallback for text written with
        #single quotes, because applying it to valid JSON breaks any value containing an apostrophe
        try:
            bad_message = json.loads(element)
        except json.JSONDecodeError:
            bad_message = json.loads(element.replace("'",'"'))

        for bad_message_row in bad_message['elements']:
            #iterate over a snapshot: rows are removed from the live list inside the loop, and
            #removing from a list while iterating over it skips the row right after every match
            for input_data in list(bad_input_data['elements']):
                if input_data[bad_message_row['location']] == bad_message_row['message'].split(': ')[1]:
                    count = count+1
                    #this is the final message that will be seen by the user
                    message = 'Error #{} - Message: {} in column {} - Input: {}\n'.format(count, bad_message_row['message'],bad_message_row['location'],input_data)
                    #this will remove the used input row so it wont be used more than once
                    bad_input_data['elements'].remove(input_data)
                    yield message
In [6]:
class json_ext(beam.DoFn):
    """Expand one line of the uploaded file into the rows listed under its ``'elements'`` key."""

    def process(self, element, job_name):
        """Yield every row of ``element['elements']`` with ``'job_id'`` set to ``job_name``.

        element: text of a document with an ``'elements'`` list of row dictionaries.
        job_name: value stored under ``'job_id'`` in every yielded row.

        Raises ``json.JSONDecodeError`` when the text can be parsed neither as it is nor
        after the single-quote fallback.
        """
        #imported inside the method like in the original, presumably so the name also resolves on the workers
        import json

        #parse valid JSON untouched: replacing every ' with " up front corrupts lines whose string
        #values contain an apostrophe. The replacement is kept as a fallback for single-quoted text.
        try:
            json_row = json.loads(element)
        except json.JSONDecodeError:
            json_row = json.loads(element.replace("'",'"'))

        for row_data in json_row['elements']:
            row_data['job_id'] = job_name
            yield row_data

In [7]:
#this is needed to run the website html
app = Flask(__name__)

#Flask's secret key. It should not live in source control, so a FLASK_SECRET_KEY environment
#variable takes precedence; the original hardcoded value remains only as a fallback so that
#existing setups keep working.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or "35367565"

@app.route('/')
def form():
    """Redirect requests for the site root to the upload page."""
    upload_url = url_for('upload')
    return redirect(upload_url)

def _load_table_schema(table_id):
    """Return the schema of table ``testing.<table_id>`` as the ``{'fields': [...]}`` dict given to WriteToBigQuery."""
    table = client.get_table('model-craft-342921.testing.{}'.format(table_id))
    fields = [
        {'name': field.name, 'type': field.field_type, 'mode': field.mode}
        for field in table.schema
    ]
    return {'fields': fields}


def _run_ingestion_job(table_id, schema, job_name, log_filename):
    """Run the Dataflow job that loads the uploaded file into ``testing.<table_id>`` and wait for it to finish.

    The job reads gs://data_intake4062022/data.json, expands every line with ``json_ext``,
    streams the rows into BigQuery without retrying, and writes the rows BigQuery rejected
    to gs://practice_error_logs/<log_filename>.
    """
    beam_options = PipelineOptions(
                                runner = 'DataflowRunner',
                                #runner='DirectRunner',
                                project = 'model-craft-342921',
                                job_name = '{}'.format(job_name),
                                temp_location = 'gs://data_intake4062022/temp1',
                                region='us-east1',
                                service_account_email = 'practice-py@model-craft-342921.iam.gserviceaccount.com'
                            )
    p = beam.Pipeline(options=beam_options)
    events  = (p | 'ReadData' >> beam.io.ReadFromText('gs://data_intake4062022/data.json')
           | 'json extract' >> beam.ParDo(json_ext(), job_name)
           | 'WriteToBigQuery2' >>  beam.io.gcp.bigquery.WriteToBigQuery(
               'model-craft-342921:testing.{}'.format(table_id),
               schema=schema,
               write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
               insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_NEVER,
               method='STREAMING_INSERTS')
            )
    #all failed rows are stored in gcp cloud storage
    (events[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS]
            | "Bad lines" >> beam.io.WriteToText('gs://practice_error_logs/{}'.format(log_filename), shard_name_template = "")
    )

    result = p.run()
    result.wait_until_finish()


def _handle_upload_post(tables):
    """Process a submitted upload form and return the message for the user (``None`` when nothing went wrong)."""
    #this will only run if what the user submitted is a file
    if 'file' not in request.files:
        return "There was no file to upload"

    uploaded = request.files['file']
    filename = secure_filename(uploaded.filename)

    #this will check if the file has data to be processed
    if len(uploaded.readlines()) == 0:
        return "File has no data to process"
    #readlines() consumed the stream; rewind it so the upload below sends the whole file
    uploaded.seek(0)

    try:
        #only json files are ingested
        if not filename.endswith('.json'):
            #NOTE(review): "excepted" is probably a typo for "accepted"; text kept unchanged
            return "File type is not excepted"

        #the table name comes from the browser and ends up in the table reference, so it has to be
        #one of the tables offered on the page; this also covers a form sent without any selection
        selected = request.form.getlist('checks')
        if not selected or selected[0] not in tables:
            return "No valid table was selected"
        table_id = selected[0]

        #upload the data fetched from the front-end to gcp cloud storage
        blob = bucket.blob('data.json')
        blob.upload_from_file(uploaded, timeout=3600)

        #without the table schema the job cannot write anything, so stop here instead of
        #launching a Dataflow job with an empty schema
        try:
            schema = _load_table_schema(table_id)
        except Exception as error:
            print('This was the error: ', error)
            return "Error setting up the data ingestion pipeline"

        #this creates a unique job name for dataflow
        job_name = 'pythontobigquerry-{}'.format(uuid.uuid4())
        #unique name for the file saved to gcp cloud storage. files with the same name but in different buckets are tied together
        log_filename = "LOG: "+ str(datetime.datetime.now()) + ".txt"

        try:
            _run_ingestion_job(table_id, schema, job_name, log_filename)
        except Exception as error:
            print('This was the error: ', error)
            return "Error setting up the data ingestion pipeline"
    except Exception as error:
        print('This was the error: ', error)
        return "There was an error in creating the request"

    return None


#This will render the frontend of the website to get the json file
@app.route('/upload', methods=['GET','POST'])
def upload():
    """Show the upload page; on POST, first load the submitted json file into the selected BigQuery table.

    Returns the rendered ``front.html`` template with ``message`` (``None`` or the text
    describing what went wrong) and ``tables`` (the table names offered to the user).
    """
    #resets the message so that previous messages dont confuse the user
    message = None

    #the tables offered as options to be used in bigquery
    tables = ['Task3'] #change--------------------------

    #This will only run if the user attempts to submit a file
    if request.method == 'POST':
        message = _handle_upload_post(tables)

    #this will render the template on the website
    return render_template("front.html", message = message, tables = tables)

In [None]:
#start Flask's built-in server with its default settings when this cell is executed;
#the call keeps running and serving requests until it is interrupted
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [21/Apr/2022 13:47:47] "[32mGET / HTTP/1.1[0m" 302 -
127.0.0.1 - - [21/Apr/2022 13:47:47] "[37mGET /upload HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [21/Apr/2022 13:47:47] "[37mGET /upload HTTP/1.1[0m" 200 -


  is_streaming_pipeline = p.options.view_as(StandardOptions).streaming
127.0.0.1 - - [21/Apr/2022 13:53:25] "[37mPOST /upload HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [21/Apr/2022 13:53:25] "[37mPOST /upload HTTP/1.1[0m" 200 -
