In [1]:
#Project: Import CSV files into a BigQuery table
#StartDate: 4/06/2022
#EndDate: 
#Developer: Bradley, Hongquy, Khoa

In [2]:
from flask import Flask,render_template,request,redirect,url_for
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import storage
import os, re, datetime, uuid
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from werkzeug.utils import secure_filename

In [3]:
# Service-account key that authenticates this app against BigQuery and Cloud Storage.
# An already-exported GOOGLE_APPLICATION_CREDENTIALS takes precedence so the notebook
# is not tied to one developer's machine; the original local path remains the fallback.
Key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or (
    r"C:\\Users\Brad\Documents\model-craft-342921-ea36cdb339e7.json"
)

# Exported for code that looks the key up from the environment instead of being
# handed `credentials` (presumably the Dataflow job submission - TODO confirm).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = Key_path
credentials = service_account.Credentials.from_service_account_file(
    Key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery and Cloud Storage clients share the same credentials and project.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)
storage_client = storage.Client(credentials=credentials, project=credentials.project_id)

# Bucket that receives the uploaded CSV and the pipeline's temp files.
bucket = storage_client.get_bucket("data_intake4062022")

# Random suffix keeps the Dataflow job name unique per kernel session.
job_name = 'pythontobigquerry-{}'.format(uuid.uuid4())

beam_options = PipelineOptions(
    runner='DataflowRunner',
    project='model-craft-342921',
    job_name=job_name,
    temp_location='gs://data_intake4062022/temp1',
    region='us-east1',
    service_account_email='practice-py@model-craft-342921.iam.gserviceaccount.com',
)

In [4]:
class FilterRecord(beam.DoFn):
    """Validate one split CSV row against the destination table's columns.

    ``process`` receives ``element`` (the list of string values of one row) and
    ``table_sch`` (a list of dicts with ``name``, ``type`` and ``mode`` keys).
    A valid row is emitted unchanged on the ``Good`` output; an invalid row is
    turned into a log line on ``Bad``. A ``1`` is emitted on ``Tgood`` /
    ``Tbad`` alongside so each outcome can be counted.
    """

    @staticmethod
    def _find_error(element, table_sch):
        """Return the rejection message for ``element``, or '' when the row is valid.

        When several columns are invalid, the message for the last offending
        column is the one returned.
        """
        if len(element) > len(table_sch):
            return 'There are too many columns. Expected {} columns, not {}'.format(
                len(table_sch), len(element))

        error = ''
        for value, column in zip(element, table_sch):
            stripped = value.strip()
            # STRING columns accept any text, so only numeric types are checked.
            if column['type'] == 'INTEGER':
                if stripped.isalpha():
                    error = 'Expected int not string data type in column: {}'.format(column['name'])
                elif '.' in value:
                    error = 'Expected int not float data type in column: {}'.format(column['name'])
            elif column['type'] == 'FLOAT':
                if stripped.isalpha():
                    error = 'Expected float not string data type in column: {}'.format(column['name'])
                # `'.' in value == False` is a chained comparison that is always
                # False, so this check never fired; `not in` is what was meant.
                # Empty values are left to the REQUIRED check below.
                elif stripped and '.' not in value:
                    error = 'Expected float not int data type in column: {}'.format(column['name'])
            if column['mode'] == 'REQUIRED' and len(value) == 0:
                error = 'Null in required column: {}'.format(column['name'])
        return error

    def process(self, element, table_sch):
        # Imported inside the method as in the original (presumably so the name
        # resolves where the DoFn actually executes - TODO confirm).
        import apache_beam as beam

        error = self._find_error(element, table_sch)
        if not error:
            yield beam.pvalue.TaggedOutput('Good', element)
            yield beam.pvalue.TaggedOutput('Tgood', 1)
        else:
            error_log = """| Error Message - {} || Inputted Data - {} |\n""".format(error, element)
            yield beam.pvalue.TaggedOutput('Bad', error_log)
            yield beam.pvalue.TaggedOutput('Tbad', 1)
        
            

In [5]:
class formating(beam.DoFn):
    """Convert a validated row (list of strings) into a dict keyed by column name.

    ``table_sch`` is a list of dicts with ``name`` and ``type`` keys, positionally
    aligned with ``element``. Only STRING, INTEGER and FLOAT columns are
    populated; columns of any other type are left out of the record.
    """

    def process(self, element, table_sch):
        casts = {'STRING': str, 'INTEGER': int, 'FLOAT': float}
        record = {}
        for value, column in zip(element, table_sch):
            cast = casts.get(column['type'])
            if cast is None:
                continue
            if cast is not str and value.strip() == '':
                # int('') / float('') raise ValueError; an empty numeric value
                # is stored as a null instead.
                record[column['name']] = None
            else:
                record[column['name']] = cast(value)
        yield record

In [6]:
class converter(beam.DoFn):
    """Build the single summary row describing one import run.

    ``element`` is the count of good records; ``total_records``, ``job_name``
    and ``countBad`` arrive as extra arguments to ``process``.
    """

    def process(self, element, total_records, job_name, countBad):
        # Keys must be string literals: the bare names used previously
        # (record[Project_id], ...) were undefined and raised NameError.
        yield {
            'Project_id': str(job_name),
            'Total_records': int(total_records),
            'Total_good_records': int(element),
            'Total_bad_records': int(countBad),
        }

In [7]:
#this is needed to run the website html
app = Flask(__name__)

# Flask uses the secret key to sign session data. Prefer a value supplied through
# the FLASK_SECRET_KEY environment variable; the literal is only a development
# fallback and should not be relied on in a deployed app.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or "35367565"

@app.route('/')
def form():
    """Send visitors of the site root to the upload page."""
    upload_url = url_for('upload')
    return redirect(upload_url)

#This will render the frontend of the website to get the json file
def _fetch_table_schema(table_id):
    """Return the destination table's columns as a list of name/type/mode dicts."""
    table = client.get_table('model-craft-342921.testing.{}'.format(table_id))
    return [
        {'name': field.name, 'type': field.field_type, 'mode': field.mode}
        for field in table.schema
    ]


def _run_import_pipeline(table_id, table_sch, total_records, log_name):
    """Build the Beam pipeline for one upload, run it, and wait for it to finish.

    Reads the uploaded CSV from the bucket, routes each row through
    FilterRecord, appends good rows to the selected table, appends one summary
    row to the Statics table, and writes rejected rows to a log file.
    """
    # NOTE(review): job_name is created once at module level, so every run made
    # from the same process records the same Project_id - confirm this is intended.
    p = beam.Pipeline(options=beam_options)
    good, bad, Tgood, Tbad = (
        p
        | 'ReadData' >> beam.io.ReadFromText('gs://data_intake4062022/data.csv', skip_header_lines=1)
        | 'Split' >> beam.Map(lambda x: x.split(','))
        | 'Filter' >> beam.ParDo(FilterRecord(), table_sch).with_outputs("Good", "Bad", "Tgood", "Tbad")
    )
    countBad = Tbad | 'Count Bad records' >> beam.combiners.Count.Globally()

    # Summary row -> Statics table. This write needs the statistics schema; it
    # was previously given the data table's schema (the two were swapped).
    (
        Tgood
        | 'Count Good records' >> beam.combiners.Count.Globally()
        | 'format to dict1' >> beam.ParDo(converter(), total_records, job_name, beam.pvalue.AsSingleton(countBad))
        | 'WriteToBigQuery1' >> beam.io.WriteToBigQuery(
            'model-craft-342921:testing.Statics',
            schema='Project_id:STRING,Total_records:INTEGER,Total_good_records:INTEGER,Total_bad_records:INTEGER,Completion_time:STRING',
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    )

    # Good rows -> the table the user selected, using that table's own schema.
    (
        good
        | 'format to dict2' >> beam.ParDo(formating(), table_sch)
        | 'WriteToBigQuery2' >> beam.io.WriteToBigQuery(
            'model-craft-342921:testing.{}'.format(table_id),
            schema={'fields': table_sch},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    )

    # Rejected rows -> a single (unsharded) log file.
    (
        bad
        | 'log bad records' >> beam.io.WriteToText('gs://practice_error_logs/{}'.format(log_name), shard_name_template="")
    )

    result = p.run()
    result.wait_until_finish()


@app.route('/upload', methods=['GET','POST'])
def upload():
    """Render the upload form and, on POST, load the submitted CSV into BigQuery.

    On POST the uploaded file must be a non-empty ``.csv`` and the form must
    name one of the offered tables. The file is stored in the bucket as
    ``data.csv`` and the import pipeline is run to completion before the page
    is rendered again with a status message.
    """
    #resets the message so that previous messages dont confuse the user
    message = None

    tables = ['Task3'] #change--------------------------

    #This will only run if the user attempts to submit a file
    if request.method == 'POST':
        if 'file' in request.files:
            file = request.files['file']
            filename = secure_filename(file.filename)
            # Read once: the lines serve both the empty check and the record count.
            lines = file.readlines()
            selected = request.form.getlist('checks')

            if len(lines) == 0:
                message = "File has no data to process"
            elif not filename.endswith('.csv'):
                message = "File type is not excepted"
            elif not selected or selected[0] not in tables:
                # The table name comes from the request and ends up inside a
                # table reference, so only offered tables are accepted.
                message = "No valid table was selected"
            else:
                table_id = selected[0]
                try:
                    # A failed schema lookup aborts here instead of running the
                    # job with an empty schema.
                    table_sch = _fetch_table_schema(table_id)

                    file.seek(0)
                    blob = bucket.blob('data.csv')
                    blob.upload_from_file(file)

                    # The pipeline skips the header line, so it is not a record.
                    # Passing the raw list of lines made int() fail downstream.
                    total_records = max(len(lines) - 1, 0)

                    log_name = "LOG: " + str(datetime.datetime.now()) + ".txt"
                    try:
                        _run_import_pipeline(table_id, table_sch, total_records, log_name)
                        message = "Data uploaded to the Bigquery"
                    except Exception as error:
                        print('This was the error: ', error)
                        message = "Error setting up the pipeline"
                except Exception as error:
                    print('This was the error: ', error)
                    message = "There was an error in creating the request"
        #This will run if the submition is not a file type
        else:
            message = "There was no file to upload"

    #this will render the template on the website
    return render_template("front.html", message = message, tables = tables)


In [None]:
# Start the Flask server only when this code is executed directly.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
INFO:werkzeug:127.0.0.1 - - [12/Apr/2022 14:41:03] "[32mGET / HTTP/1.1[0m" 302 -
INFO:werkzeug:127.0.0.1 - - [12/Apr/2022 14:41:03] "[37mGET /upload HTTP/1.1[0m" 200 -


  is_streaming_pipeline = p.options.view_as(StandardOptions).streaming
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location
