In [1]:
#Project: Import csv to bigquery table
#StartDate: 4/06/2022
#EndDate: 
#Developer: Bradley Matheson

In [2]:
#pip install apache-beam[gcp]

In [3]:
#pip install --upgrade google-cloud-storage

In [4]:
from flask import Flask,render_template,request
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import storage
import os, re
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from werkzeug.utils import secure_filename

In [5]:
# Module-level configuration shared by the code below: the Flask app, the
# Google Cloud credentials/clients, the BigQuery schema and the Beam options.

# Flask application object; the route defined further down is registered on it.
app = Flask(__name__)

# Secret key assigned to the Flask app.
# NOTE(review): the key is hardcoded in the notebook; consider reading it from
# the environment instead of committing it.
app.secret_key = "35367565"

# Path of the service-account key file used to authenticate against the project.
# NOTE(review): absolute, machine-specific path. Because this is a raw string,
# the leading "\\" is two literal backslashes - confirm that is intended.
Key_path = r"C:\\Users\Brad\Documents\model-craft-342921-ea36cdb339e7.json"

# Publish the key file through GOOGLE_APPLICATION_CREDENTIALS and also build an
# explicit credentials object from the same file with the cloud-platform scope.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = Key_path
credentials = service_account.Credentials.from_service_account_file(Key_path,scopes=["https://www.googleapis.com/auth/cloud-platform"])
# BigQuery client and target table id.
# NOTE(review): neither `client` nor `table_id` is referenced by the code
# visible in this notebook (the pipelines write to testing.Task3) - confirm
# whether they are still needed.
client=bigquery.Client(credentials=credentials,project=credentials.project_id)
table_id='model-craft-342921.testing.Task2'
# Cloud Storage client and the bucket that receives the uploaded csv file.
# get_bucket is called when this cell runs, so the cell needs valid credentials.
storage_client=storage.Client(credentials=credentials,project=credentials.project_id)
bucket = storage_client.get_bucket("data_intake4062022")
# Earlier string form of the schema, kept for reference.
#SCHEMA = 'name:STRING,id:INTEGER,salary_in_k:FLOAT,phonenumber:STRING'
# Schema handed to WriteToBigQuery: four columns, all marked REQUIRED.
SCHEMA = {
    'fields' : [
                {'name' : 'name', 'type' : 'STRING', 'mode' : 'REQUIRED'},
                {'name' : 'id', 'type' : 'INTEGER', 'mode' : 'REQUIRED'},
                {'name' : 'salary_in_k', 'type' : 'FLOAT', 'mode' : 'REQUIRED'},
                {'name' : 'phonenumber', 'type' : 'STRING', 'mode' : 'REQUIRED'}
               ]
}
# Options used for every beam.Pipeline created below: the Dataflow runner, the
# project/region, a fixed job name, a temp location inside the intake bucket
# and the service account the job is submitted with.
beam_options = PipelineOptions(
                            runner = 'DataflowRunner',
                            project = 'model-craft-342921',
                            job_name = 'pythontobigquerry',
                            temp_location = 'gs://data_intake4062022/temp1',
                            region='us-east1',
                            service_account_email = 'practice-py@model-craft-342921.iam.gserviceaccount.com'
                        )
# Unused alternative input location, kept for reference.
#input_path = 'gs://dataflow_example41622/data_intake'

class FilterRecord(beam.DoFn):
    """Route each split CSV row to the 'Good' or 'Bad' tagged output.

    ``element`` is the list of string fields produced by splitting one CSV
    line, in the column order name, id, salary_in_k, phonenumber.  A row is
    tagged 'Good' only when all of the following hold (fields are checked by
    position, after stripping surrounding whitespace):

    * the row has at least four fields,
    * name is purely alphabetic,
    * id parses with ``int()``,
    * salary_in_k contains a decimal point and parses with ``float()``,
    * phonenumber is non-empty and not purely alphabetic.

    Every other row is tagged 'Bad'.  The element itself is emitted unchanged.

    Changes from the previous version:

    * columns are identified by index, not by comparing a field's value with
      the values of the other fields (equal values used to pick the wrong rule);
    * the salary rule uses ``'.' not in value``; the old expression
      ``'.' in value == False`` is a chained comparison that is always False;
    * the phonenumber rule no longer reuses the name rule, which tagged every
      phone number containing digits as 'Bad';
    * ids/salaries are validated with the same ``int()``/``float()`` calls the
      downstream 'format to dict' step uses, and short rows are rejected, so a
      'Good' row can no longer crash that step.
    """

    @staticmethod
    def _is_valid(element):
        """Return True when the first four fields of ``element`` pass validation."""
        # Later pipeline steps index fields 0..3 positionally.
        if len(element) < 4:
            return False

        name, record_id, salary, phone = (field.strip() for field in element[:4])

        # name: letters only (this also rejects the empty string).
        if not name.isalpha():
            return False

        # id: must be an integer literal (rejects '', 'abc', '1.5', '12a').
        try:
            int(record_id)
        except ValueError:
            return False

        # salary_in_k: must be written with a decimal point and be a valid float.
        if '.' not in salary:
            return False
        try:
            float(salary)
        except ValueError:
            return False

        # phonenumber: present and not made up solely of letters.
        return bool(phone) and not phone.isalpha()

    def process(self, element):
        """Yield ``element`` wrapped in a TaggedOutput named 'Good' or 'Bad'."""
        # Function-scope import kept from the original; presumably so that
        # `beam` resolves when the DoFn runs on a remote worker - TODO confirm.
        import apache_beam as beam

        tag = 'Good' if self._is_valid(element) else 'Bad'
        yield beam.pvalue.TaggedOutput(tag, element)



def discard_incomplete(data):
    """Return True only when none of the record's four fields is empty.

    ``data`` is indexed by 'name', 'id', 'salary_in_k' and 'phonenumber', in
    that order; checking stops at the first empty value.
    """
    required_fields = ('name', 'id', 'salary_in_k', 'phonenumber')
    return all(len(data[field]) > 0 for field in required_fields)

def convert_types(data):
    """Cast the record's fields in place and return the same dict.

    'name' and 'phonenumber' are passed through ``str``, 'id' through ``int``
    and 'salary_in_k' through ``float``.  A field that is absent from ``data``
    is added with the value None.
    """
    casts = (
        ('name', str),
        ('id', int),
        ('salary_in_k', float),
        ('phonenumber', str),
    )
    for field, cast in casts:
        if field in data:
            data[field] = cast(data[field])
        else:
            data[field] = None
    return data

def get_error_report(errors, file_dataframe):
    """Build one human-readable log line per entry in ``errors``.

    Each entry is indexed by 'message'.  When the message contains a quoted
    value, the rows of ``file_dataframe`` containing that value are reported
    and the positional part of the message is replaced by the column name.
    Otherwise (or when that first attempt fails for any reason) the error is
    treated as a missing value: the rows of ``file_dataframe`` that contain a
    null are collected and the next not-yet-reported one is used.

    Returns the list of log lines; any exception raised outside the first
    attempt propagates to the caller.
    """
    column_names = ["name","id","salary_in_k","phonenumber"]
    report_lines = []
    # How many null-containing rows have already been attributed to an error.
    null_rows_reported = 0

    for position, entry in enumerate(errors):
        text = entry['message']
        numbers = re.findall(r'\d+', text)
        try:
            quoted_value = re.findall(r"'(.*?)'", text)[0]
            rows = file_dataframe[file_dataframe.isin([quoted_value]).any(axis=1)].values
            located = "field id (position {}) starting at location {}".format(numbers[-2], numbers[-1])
            description = text.replace(located, """column '{}'""".format(column_names[int(numbers[-2])]))
        except:
            # Bare except kept on purpose: any failure above falls back to the
            # missing-value interpretation of the error.
            null_rows = file_dataframe[file_dataframe.isna().any(axis=1)].values
            located = "index: {} is missing in row starting at position: {}".format(numbers[-2], numbers[-1])
            description = text.replace(located, """'{}'""".format(column_names[int(numbers[-2])]))
            # Report a single null row per error, advancing through them in order.
            rows = null_rows[null_rows_reported]
            null_rows_reported += 1

        report_lines.append(
            """| Error # - {} || Error Message - {} || Inputted Data - {} |""".format(position, description, rows)
        )

    return report_lines


#Renders the upload page and, when a csv file is posted, starts the pipeline for it
@app.route('/', methods=['GET','POST'])
def index():
    """Serve front.html and handle csv uploads posted to it.

    GET renders the page with no message.  POST requires a 'file' part whose
    sanitised name ends in '.csv'; the file is stored in the intake bucket as
    'data.csv' and a Beam pipeline is built and run that splits each line,
    validates it with FilterRecord, appends the 'Good' rows to BigQuery and
    writes the 'Bad' rows to a text file.  The page is re-rendered with a
    message describing the outcome.
    """
    #nothing has been submitted yet, so there is no message to show
    if request.method != 'POST':
        return render_template("front.html", message = None)

    #the form was posted without a file part
    if 'file' not in request.files:
        return render_template("front.html", message = "There was no file to upload")

    upload = request.files['file']
    safe_name = secure_filename(upload.filename)

    #only csv uploads are processed
    if not safe_name.endswith('.csv'):
        return render_template("front.html", message = "File type is not excepted")

    #the upload is always stored under the same object name; a failure here is
    #not caught and propagates to Flask
    bucket.blob('data.csv').upload_from_file(upload)

    try:
        pipeline = beam.Pipeline(options=beam_options)

        split_rows = (
            pipeline
            | 'ReadData' >> beam.io.ReadFromText('gs://data_intake4062022/data.csv', skip_header_lines =1)
            | 'Split' >> beam.Map(lambda x: x.split(','))
        )
        good, bad = split_rows | 'Filter' >> beam.ParDo(FilterRecord()).with_outputs("Good", "Bad")

        #valid rows are converted to typed dicts and appended to the table
        (
            good
            | 'format to dict' >> beam.Map(lambda x: {"name": str(x[0]), "id": int(x[1]), "salary_in_k": float(x[2]), "phonenumber": str(x[3])})
            | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
                '{0}:testing.Task3'.format(credentials.project_id),
                schema=SCHEMA,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        )

        #rejected rows are written to a single text file
        bad | 'log bad records' >> beam.io.WriteToText('gs://practice_error_logs/Bad.csv', shard_name_template = "")

        pipeline.run()
        outcome = "Data uploaded to the Bigquery"
    except Exception as error:
        print('This was the error: ', error)
        outcome = "Error setting up the pipeline"

    return render_template("front.html", message = outcome)


In [None]:
# Start the Flask server with its default settings, but only when this code is
# executed as the main program (not when it is imported).
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
INFO:werkzeug:127.0.0.1 - - [08/Apr/2022 13:40:56] "[37mGET / HTTP/1.1[0m" 200 -


  is_streaming_pipeline = p.options.view_as(StandardOptions).streaming
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location
INFO:werkzeug:127.0.0.1 - - [08/Apr/2022 13:41:27] "[37mPOST / HTTP/1.1[0m" 200 -


# Stand-alone variant of the upload pipeline: reads the uploaded csv from the
# intake bucket, splits each line on commas, maps the first four fields to the
# schema's column names and appends the rows to testing.Task3.
# NOTE(review): unlike the pipeline built inside index(), this one does not run
# FilterRecord and leaves every value as a string (no int()/float() casts), and
# it passes skip_header_lines = True where index() passes 1 - confirm whether
# this cell is still needed.
with beam.Pipeline(options=beam_options) as pipeline:
                            records = (
                                pipeline
                                | beam.io.ReadFromText('gs://data_intake4062022/data.csv', skip_header_lines = True)
                                | beam.Map(lambda x : x.split(","))
                                | beam.Map(lambda x: {"name": x[0], "id": x[1], "salary_in_k": x[2], "phonenumber": x[3]})
                                | beam.io.WriteToBigQuery('{0}:testing.Task3'.format(credentials.project_id), schema=SCHEMA, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
                                
                                )