In [1]:
#Project: Import csv to bigquery table
#StartDate: 4/06/2022
#EndDate: 
#Developer: Bradley, Hongquy, Khoa

In [2]:
from flask import Flask,render_template,request,redirect,url_for
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import storage
import os, re, datetime, uuid, time, json
import google.cloud.logging
import apache_beam as beam
from io import StringIO
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from werkzeug.utils import secure_filename

In [3]:
# --- Google Cloud project / Dataflow job settings (passed to PipelineOptions) ---
project = "model-craft-342921"
region = "us-east1"
service_account_email = "practice-py@model-craft-342921.iam.gserviceaccount.com"
temp_location = "gs://data_intake4062022/temp1"

# --- BigQuery dataset that holds the destination tables ---
dataset = "testing"

# --- Cloud Storage locations ---
# CSV that the pipeline reads (the upload handler writes the submitted file here).
data_input = "gs://data_intake4062022/data.csv"
# Prefix under which the pipeline writes the rows BigQuery rejected.
bad_rows_input = "gs://practice_error_logs"
# Bucket URLs for error/completion messages.
# NOTE(review): neither name is read by the code in this notebook - confirm they are still needed.
error_message = "gs://error_messages4142022"
message_ouput = "gs://complete_message_4182022"

# --- Local scratch file used when uploading the error report ---
# NOTE(review): raw string, so the doubled backslash after "C:" is kept literally.
local_file_loc = r"C:\\Users\Brad\temp.txt"

In [4]:
# Path to the service-account key file used to authenticate every Google Cloud client below.
# NOTE(review): this is a hard-coded absolute path on one developer's machine - consider
# reading it from an environment variable or a config file instead.
Key_path = r"C:\\Users\Brad\Documents\model-craft-342921-ea36cdb339e7.json"

# Export the key path so that clients created without explicit credentials
# (the logging client further down) can pick it up from the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = Key_path
credentials = service_account.Credentials.from_service_account_file(Key_path,scopes=["https://www.googleapis.com/auth/cloud-platform"])
# BigQuery client: used to fetch table schemas, re-insert failed rows and run the row-count query.
client=bigquery.Client(credentials=credentials,project=credentials.project_id)

# Cloud Storage client used to look up the buckets below.
storage_client=storage.Client(credentials=credentials,project=credentials.project_id)

# Bucket that receives the uploaded CSV ("data.csv"), which the pipeline then reads.
bucket = storage_client.get_bucket("data_intake4062022")

# Bucket intended for BigQuery error locations/messages.
# NOTE(review): not referenced by any code in this notebook - confirm it is still needed.
bucket1 = storage_client.get_bucket("error_messages4142022")

# Bucket from which get_errors() downloads the rows that failed to insert into BigQuery.
bucket2 = storage_client.get_bucket("practice_error_logs")

# Bucket that receives the per-upload error report text file.
bucket3 = storage_client.get_bucket("complete_message_4182022")

# Cloud Logging client and a logger handle for the Dataflow worker log.
# NOTE(review): `logger` is not used by any code in this notebook - confirm it is still needed.
client1 = google.cloud.logging.Client()
logger = client1.logger(name="dataflow.googleapis.com%2Fworker")

In [5]:
# DoFn that turns one split CSV line into a dictionary keyed by column name,
# so the row can be handed to the BigQuery writer.
class formating(beam.DoFn):
    """Map a list of field values onto the destination table's column names."""

    def process(self, element, table_sch, job_name):
        """Yield one row dictionary for a parsed input line.

        Args:
            element: sequence of field values, in column order.
            table_sch: list of schema field dicts; only each dict's 'name' is read.
            job_name: value stored under the 'job_id' key of every row.

        Yields:
            dict with 'job_id' plus one entry per input value. Empty strings
            become None. Values at positions beyond the mapped columns are
            stored under 'unknown<position>'.
        """
        import apache_beam as beam
        row = {'job_id': job_name}
        for position, value in enumerate(element):
            # The last schema field is never mapped by position.
            # NOTE(review): presumably that field is the job_id column, which is
            # filled from job_name above - confirm against the table schema.
            if position >= len(table_sch) - 1:
                row['unknown{}'.format(position)] = value
                continue
            column = table_sch[position]['name']
            row[column] = None if value == '' else value
        yield row

In [6]:
class formating2(beam.DoFn):
    """Extract the row from a failed-insert element and drop its 'job_id' entry."""

    def process(self, element, table_id):
        """Yield the failed row without its 'job_id' key.

        Args:
            element: indexable pair whose second item is the row dictionary.
                NOTE(review): the first item is presumably the destination
                table - confirm against the BigQuery sink's failed-rows output.
            table_id: accepted for interface compatibility; not used.

        Yields:
            A new dict holding the row's columns except 'job_id'.
        """
        # Work on a copy: a DoFn should not modify its input element, because
        # the runner may hand the same object to other consumers.
        row = dict(element[1])
        row.pop('job_id', None)
        yield row

In [7]:
def schema_fetch(table_id):
    """Fetch the schema of a BigQuery table as a plain dictionary.

    Args:
        table_id: table name inside the configured project and dataset.

    Returns:
        {'fields': [{'name': ..., 'type': ..., 'mode': ...}, ...]} with one
        entry per schema field, or None when the lookup fails (the error is
        printed rather than raised).
    """
    SCHEMA = {}
    try:
        table = client.get_table('{}.{}.{}'.format(project,dataset,table_id))
        SCHEMA['fields'] = [
            {'name': field.name, 'type': field.field_type, 'mode': field.mode}
            for field in table.schema
        ]
        return SCHEMA
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print(e)
        return None

In [8]:
def main_pipeline(beam_options,SCHEMA, job_name,filename,table_id):
    """Run the Beam pipeline that loads the uploaded CSV into BigQuery.

    Reads `data_input` (skipping the header line), turns every line into a row
    dictionary with `formating`, and writes the rows to
    `project:dataset.table_id` with streaming inserts and no retries. Rows on
    the sink's FAILED_ROWS output are stripped of 'job_id' by `formating2` and
    written as text to `<bad_rows_input>/<filename>.txt`.

    Args:
        beam_options: PipelineOptions for the run.
        SCHEMA: dict with a 'fields' list describing the destination table.
        job_name: identifier stored in every row's 'job_id' column.
        filename: base name (without extension) of the failed-rows text file.
        table_id: destination table name.

    Raises:
        TypeError: if SCHEMA is None (e.g. the schema lookup failed).
        Any exception raised while building or running the pipeline.
    """
    def split_csv_line(line):
        """Split one CSV line into fields, honouring quoted fields."""
        # Imported here so the name resolves on remote workers, matching the
        # function-scope imports used by the DoFns in this notebook.
        import csv
        # csv.reader keeps commas inside quoted fields together, which a plain
        # split(',') does not. Lines are still read one at a time, so quoted
        # fields containing line breaks are not supported.
        return next(csv.reader([line]), [])

    table_sch = SCHEMA['fields']
    p = beam.Pipeline(options=beam_options)
    events = (
        p
        | 'ReadData' >> beam.io.ReadFromText(data_input, skip_header_lines=1)
        | 'Split' >> beam.Map(split_csv_line)
        | 'format to dict2' >> beam.ParDo(formating(), table_sch, job_name)
        | 'WriteToBigQuery2' >> beam.io.gcp.bigquery.WriteToBigQuery(
            '{}:{}.{}'.format(project,dataset,table_id),
            schema=SCHEMA,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            batch_size=100000,
            # Never retry: rejected rows go straight to the FAILED_ROWS output.
            insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_NEVER,
            method='STREAMING_INSERTS')
    )
    (
        events[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS]
        | "remove tuple" >> beam.ParDo(formating2(), table_id)
        # Empty shard template: everything goes to a single file with exactly this name.
        | "Bad lines" >> beam.io.WriteToText(
            '{}/{}.txt'.format(bad_rows_input,filename),
            append_trailing_newlines=True,
            shard_name_template="")
    )
    result = p.run()
    result.wait_until_finish()

In [9]:
def get_errors(filename):
    """Load the rows that failed to insert for one upload.

    Downloads `<filename>.txt` from `bucket2` and parses every non-blank line
    into a Python object.

    Args:
        filename: base name (without the '.txt' extension) of the failed-rows file.

    Returns:
        {'elements': [row, ...]} with one entry per non-blank line, in file order.

    Raises:
        ValueError: if a line can be parsed neither as a Python literal nor by
            the legacy quote-replacement JSON conversion.
        Any exception raised by the storage download.
    """
    import ast

    blob = bucket2.blob('{}.txt'.format(filename))
    log_message = blob.download_as_string().decode("utf-8", "ignore")

    bad_json = []
    for row in log_message.split('\n'):
        row = row.strip()
        if not row:
            continue
        try:
            # The lines are Python dict reprs. Parsing them as literals keeps
            # apostrophes and the text "None" inside values intact, which the
            # old replace-then-json.loads approach corrupted.
            parsed = ast.literal_eval(row)
        except (ValueError, SyntaxError):
            # Fall back to the previous conversion for lines that are not
            # Python literals (e.g. already JSON-formatted).
            parsed = json.loads(row.replace("'", '"').replace("None", "null"))
        bad_json.append(parsed)

    return {'elements': bad_json}

In [10]:
# Flask application object that serves the upload page.
app = Flask(__name__)

# Flask uses the secret key to sign session data exchanged with the browser.
# Take it from the FLASK_SECRET_KEY environment variable when set, so real
# deployments do not rely on a value committed in the notebook. The legacy
# value stays as the default so existing local runs behave as before.
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "35367565")

@app.route('/')
def form():
    """Redirect requests for the site root to the upload page."""
    upload_url = url_for('upload')
    return redirect(upload_url)

def _upload_failed_row_report(filename, table_id, failed_rows):
    """Re-insert the failed rows to collect BigQuery's error details and store them as a text report in bucket3."""
    errors = []
    # Only call BigQuery when there is something to insert.
    if failed_rows:
        errors = client.insert_rows_json('{}.{}.{}'.format(project,dataset,table_id), failed_rows)
    report = ''.join(
        """Error #{}, Message - {}, Location - {}, Inputted Data -  {}\n\n""".format(
            i['index'], i['errors'][0]['message'], i['errors'][0]['location'], failed_rows[i['index']])
        for i in errors
    )
    blob = bucket3.blob('{}.txt'.format(filename))
    # Upload straight from memory. The previous version wrote 'temp.txt' in the
    # working directory but uploaded from the unrelated absolute path
    # `local_file_loc`, which only worked when both pointed at the same file.
    blob.upload_from_string(report, content_type='text/plain', timeout=3600)


def _count_good_rows(table_id, job_name):
    """Return the number of rows in the table tagged with job_name, or '' when the query fails."""
    query = """SELECT count(*) FROM `{}.{}.{}` WHERE job_id = '{}'""".format(str(project),str(dataset),str(table_id),job_name)
    num_good = ""
    try:
        # Iterating the query job waits for and fetches the result rows, so it
        # belongs inside the try as well.
        for row in client.query(query):
            num_good = row["f0_"]
    except Exception as e:
        print("ERROR: could not query", e)
    return num_good


def _append_metrics(msg):
    """Append one metrics line to metrics.csv, writing the header first when the file is new."""
    needs_header = not os.path.exists('metrics.csv')
    with open('metrics.csv', 'a') as f:
        if needs_header:
            f.write('Project Id, Execution Time, Total Rows, Total good Rows, Total Bad Rows\n')
        f.write(msg)


def _process_upload(file, tables):
    """Run the whole ingestion for one submitted file and return the message to show the user."""
    filename = secure_filename(file.filename)

    # Count the lines once: used for the empty-file check and for the metrics.
    total_records = len(file.readlines())
    if total_records == 0:
        return "File has no data to process"
    file.seek(0)

    try:
        if not filename.endswith('.csv'):
            return "File type is not excepted"

        # The table name comes from the submitted form and ends up inside a SQL
        # string, so accept only the tables that were offered to the user.
        selected = request.form.getlist('checks')
        if not selected or selected[0] not in tables:
            print('This was the error: ', 'no valid table was selected')
            return "There was an error in creating the request"
        table_id = selected[0]

        # Store the submitted CSV where the pipeline reads it from.
        blob = bucket.blob('data.csv')
        blob.upload_from_file(file, timeout=3600)

        SCHEMA = schema_fetch(table_id)

        # Unique job name for Dataflow; also stored in every inserted row.
        job_name = 'pythontobigquerry-{}'.format(uuid.uuid4())
        beam_options = PipelineOptions(
            runner = 'DataflowRunner',
            #runner='DirectRunner',
            project = project,
            job_name = job_name,
            temp_location = temp_location,
            region = region,
            service_account_email = service_account_email
        )

        # Shared base name for the files this upload writes to Cloud Storage;
        # files with the same name in different buckets belong together.
        filename = "LOG: " + str(datetime.datetime.now())

        start = time.time()
        try:
            main_pipeline(beam_options, SCHEMA, job_name, filename, table_id)
        except Exception as error:
            print('This was the error: ', error)
            # Stop here: without a pipeline run there are no failed rows or
            # metrics to report, and the later steps would only hide this error.
            return "Error setting up the data ingestion pipeline"

        # Default keeps the metrics step working even when the report step fails.
        bad_input_data = {'elements': []}
        try:
            bad_input_data = get_errors(filename)
            _upload_failed_row_report(filename, table_id, bad_input_data['elements'])
            message = "Data uploaded to the Bigquery"
        except Exception as error:
            print('This was the error: ', error)
            message = "Error setting up messaging"

        Total_time = time.time() - start

        num_good = _count_good_rows(table_id, job_name)
        # total_records - 1: the header line is not a data row.
        msg = '''{}, {} seconds, {}, {}, {}\n'''.format(str(job_name),str(Total_time), str(total_records-1), str(num_good), str(len(bad_input_data['elements'])))
        _append_metrics(msg)
        return message
    except Exception as error:
        print('This was the error: ', error)
        return "There was an error in creating the request"


@app.route('/upload', methods=['GET','POST'])
def upload():
    """Render the upload page and, on POST, ingest the submitted CSV into BigQuery.

    GET renders the form only. POST expects a file under the 'file' field and
    the destination table under the 'checks' field. It uploads the CSV to
    Cloud Storage, runs the ingestion pipeline, stores a report of the rows
    BigQuery rejected, and appends a line to metrics.csv.

    Returns:
        The rendered "front.html" template with `message` (None on GET, or a
        status/error text) and `tables` (the selectable table names).
    """
    # Start without a message so an earlier result is never shown again.
    message = None

    # Tables offered to the user as insert targets.
    tables = ['Task3'] #change--------------------------

    if request.method == 'POST':
        if 'file' in request.files:
            message = _process_upload(request.files['file'], tables)
        else:
            message = "There was no file to upload"

    return render_template("front.html", message = message, tables = tables)


In [None]:
# Start Flask's built-in server only when this code is executed directly,
# not when it is imported; host and port are left at app.run()'s defaults.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [25/Apr/2022 14:47:54] "[32mGET / HTTP/1.1[0m" 302 -
127.0.0.1 - - [25/Apr/2022 14:47:54] "[37mGET /upload HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Apr/2022 14:47:54] "[37mGET /upload HTTP/1.1[0m" 200 -


  is_streaming_pipeline = p.options.view_as(StandardOptions).streaming
127.0.0.1 - - [25/Apr/2022 14:53:36] "[37mPOST /upload HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Apr/2022 14:53:36] "[37mPOST /upload HTTP/1.1[0m" 200 -
