## B_F Assessment for BI Engineer - Sina Charandabi

In [1]:
#managing files
import os
import glob
import shutil

#parsing out the text
import re

#processing
import pandas as pd

In [27]:
##Functions to use

# extract board_nums & their associated lines of the text
def board_ext(file_path):
    """Collect board numbers and the line numbers of their header lines.

    A board header is a line that begins with ``[board<digits>]``.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        A ``(board_nums, line_num_board)`` tuple of two parallel lists: the
        board numbers as ints, and the 1-based line number on which each
        header was found.
    """
    header_pattern = re.compile(r"\[board(\d+)\]")

    board_nums = []
    line_num_board = []

    with open(file_path, "r") as handle:
        # Line numbers are 1-based so they can be compared with the line
        # numbers reported for other record types in the same file.
        for line_no, text in enumerate(handle, start=1):
            found = header_pattern.match(text)
            if found is None:
                continue
            board_nums.append(int(found.group(1)))
            line_num_board.append(line_no)

    return board_nums, line_num_board

# extract dimensions
def dimension_ext(file_path):
    """Read the decimal values from every line that starts with ``dimensions``.

    Only numbers written with a decimal point are picked up, and values equal
    to zero are discarded.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        A list with one entry per matching line, in file order; each entry is
        the list of non-zero floats found on that line.
    """
    line_pattern = re.compile(r"dimensions")
    decimal_pattern = re.compile(r'\d+\.\d+')

    dimensions_nums = []

    with open(file_path, "r") as handle:
        for text in handle:
            if line_pattern.match(text) is None:
                continue
            values = (float(token) for token in decimal_pattern.findall(text))
            dimensions_nums.append([value for value in values if value != 0])

    return dimensions_nums

# extract defects & their associated lines
def defect_ext(file_path):
    """Read the numbers from every line that starts with ``defect``.

    Each number on a matching line is parsed as a float when it is written
    with a decimal point and as an int otherwise.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        A ``(defects_num, line_num_defect)`` tuple of two parallel lists: the
        parsed numbers of each defect line, and the 1-based line number on
        which that defect line was found.
    """
    line_pattern = re.compile(r"defect")
    number_pattern = re.compile(r'\d+(?:\.\d+)?')

    defects_num = []
    line_num_defect = []

    with open(file_path, "r") as handle:
        for line_no, text in enumerate(handle, start=1):
            if line_pattern.match(text) is None:
                continue
            parsed = []
            for token in number_pattern.findall(text):
                parsed.append(float(token) if "." in token else int(token))
            defects_num.append(parsed)
            line_num_defect.append(line_no)

    return defects_num, line_num_defect

# Map a defect's line number to the number of the board it belongs to.
# Note: relies on the globals `keys` and `board_num_line_dict`, which the ETL script sets for each file.
def map_board(val):
    """Return the board number whose section of the file contains line ``val``.

    Reads two module-level names that the calling script must set before
    this function is used:

    * ``keys`` -- the board header line numbers, sorted ascending.
    * ``board_num_line_dict`` -- maps a header line number to its board number.

    Args:
        val: Line number to attribute to a board.

    Returns:
        The board number of the last header located at or before ``val``, or
        ``None`` when ``val`` precedes the first header or no headers exist.
    """
    from bisect import bisect_right

    # bisect_right counts the headers at or before `val`, so the owning
    # header sits one position to the left of the insertion point.
    pos = bisect_right(keys, val) - 1
    if pos < 0:
        # No header precedes this line; do not attribute it to any board
        # (a plain keys[-1] fallback would wrongly pick the last board).
        return None
    return board_num_line_dict[keys[pos]]

In [87]:
##ETL script
# Parses every raw "<machine>__<epochtime>.txt" file into two tables (board
# dimensions and board defects), writes them as CSV, then moves the raw files
# to the processed folder.

# Pipeline folders
RAW_DIR = 'C:\\Users\\Goldenhelp.ir\\Documents\\bie_demo\\raw'
# NOTE(review): unlike RAW_DIR and OUT_DIR this folder is not under bie_demo --
# confirm that this location is intended.
PROCESSED_DIR = 'C:\\Users\\Goldenhelp.ir\\Documents\\B_F Assessment\\processed'
OUT_DIR = 'C:\\Users\\Goldenhelp.ir\\Documents\\bie_demo\\out'

# Final column layout of the two output tables
DIMENSION_COLUMNS = ['machine', 'epochtime', 'board_num',
                     'board_length', 'board_width', 'board_thickness',
                     'allowed_width', 'allowed_thickness']
DEFECT_COLUMNS = ['machine', 'epochtime', 'board_num', 'defect_num',
                  'board_side', 'defect_type',
                  'top_left', 'top_right', 'bottom_left', 'bottom_right']

# List all .txt files from the raw directory. Full paths are kept so that
# reading and moving the files does not depend on the working directory.
txt_files = glob.glob(os.path.join(RAW_DIR, '*.txt'))

# Per-file tables, concatenated once after the loop
dimension_frames = []
defect_frames = []

# Iterate over each file (each machine_epoch)
for file_path in txt_files:

    # Extract machine and epochtime from the file name
    file_name = os.path.basename(file_path)
    name_parts = file_name.split("__")
    machine = name_parts[0]
    epochtime = int(name_parts[1].split(".")[0])

    # Parse the file: each extractor reads it once
    board_nums, board_num_lines = board_ext(file_path)
    dimensions = dimension_ext(file_path)
    defects, defect_num_lines = defect_ext(file_path)

    # Board numbers, their dimensions and the line of each board header
    dim_dict = {
        'machine': machine,
        'epochtime': epochtime,
        'board_num': board_nums,
        'board_length': [i[0] for i in dimensions],
        'board_width': [i[1] for i in dimensions],
        'board_thickness': [i[2] for i in dimensions],
        'allowed_width': [i[3] for i in dimensions],
        'allowed_thickness': [i[4] for i in dimensions],
        'board_num_line': board_num_lines,
    }
    df1 = pd.DataFrame.from_dict(dim_dict)

    # Header line -> board number lookup and its sorted line numbers.
    # map_board reads these two names as globals, so they must be
    # (re)assigned here, at module level, for every file.
    board_num_line_dict = dict(zip(board_num_lines, board_nums))
    keys = sorted(board_num_line_dict)

    # Defect fields and the line on which each defect was found
    defect_dict = {
        'machine': machine,
        'epochtime': epochtime,
        'defect_num': [i[0] for i in defects],
        'board_side': [i[1] for i in defects],
        'defect_type': [i[2] for i in defects],
        'top_left': [i[3] for i in defects],
        'top_right': [i[4] for i in defects],
        'bottom_left': [i[5] for i in defects],
        'bottom_right': [i[6] for i in defects],
        'defect_num_line': defect_num_lines,
    }
    df2 = pd.DataFrame.from_dict(defect_dict)

    # Match each defect to its board through the defect's line number
    df2['board_num'] = df2['defect_num_line'].apply(map_board)

    dimension_frames.append(df1)
    defect_frames.append(df2)

# Build the final tables; with no input files fall back to empty tables that
# still carry the expected columns.
if dimension_frames:
    df_dimensions = pd.concat(dimension_frames, ignore_index=True, sort=False)
else:
    df_dimensions = pd.DataFrame(columns=DIMENSION_COLUMNS)
if defect_frames:
    df_defects = pd.concat(defect_frames, ignore_index=True, sort=False)
else:
    df_defects = pd.DataFrame(columns=DEFECT_COLUMNS)

# Keep only the output columns (drops the helper *_line columns) in their
# final order.
df_dimensions = df_dimensions[DIMENSION_COLUMNS]
df_defects = df_defects[DEFECT_COLUMNS]

# Write the tables to CSV files
os.makedirs(OUT_DIR, exist_ok=True)
df_dimensions.to_csv(os.path.join(OUT_DIR, "dimensions.csv"), index=False)
df_defects.to_csv(os.path.join(OUT_DIR, "defects.csv"), index=False)

# Move the raw files only after the output is safely written. The folder is
# created first: if it were missing, shutil.move would rename each file to
# that path instead of moving the file into it.
os.makedirs(PROCESSED_DIR, exist_ok=True)
for file_path in txt_files:
    shutil.move(file_path, PROCESSED_DIR)