#### The primary aim of this code was to automate data processing tasks (for project tracking purpose) that involve:
   - Fetching data from an HTML file that is saved locally on the machine (a snapshot of the project tracking tool with tickets).
   - Gathering data from multiple .xlsx files in a directory (files with ticket-related data).
   - Cleaning, consolidating and saving the final results to multiple .xlsx files

#### Step-by-step breakdown of what the code does:

- Initialization: Libraries are imported, warnings are ignored and the maximum number of columns to display in pandas is set.

In [None]:
import pandas as pd
import xlsxwriter
import numpy as np
import os
import warnings
from selenium import webdriver
import time

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Wall-clock timestamp taken when this cell is executed.
startTime = time.time()
# Allow pandas to display up to 100 columns before truncating the output.
pd.set_option("display.max_columns", 100)

1. **def process_files()**: processes all .xlsx files. For each file, it reads two specific sheets, cleans the data, drops unnecessary columns, consolidates all the processed data and creates a pivot table. 
    - Each file starts with its own unique ID, and the files always follow a sequence in the folder. If the sequence is broken, it will insert a new line in the place where it was broken, to avoid losing track of a missing project in the future.
    
        ***More detailed on the above:***
        - line 46: retrieves the unique IDs and sets them as the index;
        - line 49-50: inserting the data from the 'second_column' dataframe into 'proj_details_tab' at position 0, then changes dtype
        - line 52-53: It creates a new dataframe, then merges this new dataframe with 'proj_details_tab' on the 'Number_projdetails' column using a right join. This keeps all rows from the new dataframe and matching rows from 'proj_details_tab'.
        
        
    - the final dataframe includes various project-related metrics and is structured to have each row represent a project. The function returns the final consolidated dataframe:

In [None]:
def process_files(path):
    """Consolidate every project workbook found in *path* into one dataframe.

    Each ``.xlsx`` file must contain a ``Per_File`` sheet and a ``Total``
    sheet, and its name must look like ``Project-HO<number>_<project name>.xlsx``.
    For every workbook the project numbers from ``Per_File`` are joined per
    language tag (prefixed with ``PID``) and placed next to the match metrics
    from ``Total``. The per-file results are stacked in directory order and
    then right-merged against the full ``min..max`` range of HO numbers, so a
    workbook missing from the sequence shows up as an otherwise empty row.

    Parameters
    ----------
    path : str
        Directory holding the workbooks. Files that do not end in ``.xlsx``
        and Excel lock files (``~$...``) are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``HO Number``, ``Project Number``, ``Project name``,
        ``Language``, ``Translation type`` and the match metrics. The same
        frame is also stored in the module-level ``proj_details_tab``.

    Raises
    ------
    FileNotFoundError
        If *path* contains no ``.xlsx`` workbook.
    """
    global proj_details_tab

    metric_columns = ['Language', 'Translation Type', 'ICE', '100% Matches',
                      '99-95% Matches', '94-85% Matches', '84-75% Matches',
                      '74-0% Matches', 'Reps']

    workbooks = [
        name for name in os.listdir(path)
        if name.lower().endswith('.xlsx') and not name.startswith('~$')
    ]
    if not workbooks:
        raise FileNotFoundError(f"No .xlsx files found in '{path}'")

    per_file_frames = []
    for file in workbooks:
        # One read per workbook; the result is a dict keyed by sheet name.
        sheets = pd.read_excel(os.path.join(path, file),
                               sheet_name=['Per_File', 'Total'])

        per = sheets['Per_File'].drop_duplicates(
            subset=['Language Tag', 'Project Name'], keep='last')
        pid_source = per[['Language Tag', 'Project Number']].astype(
            {'Project Number': 'str'})

        # One row per language tag, with all of its project numbers joined.
        pids = pd.pivot_table(pid_source, index=['Language Tag'],
                              values='Project Number',
                              aggfunc=lambda x: ', '.join(x))
        pids['Project Number'] = 'PID' + pids['Project Number']
        pids = pids.reset_index()

        # Rows are paired by position: row i of the pivot with row i of 'Total'.
        combined = pd.concat([pids, sheets['Total'][metric_columns]], axis=1)
        combined.insert(0, 'File name', file)
        per_file_frames.append(combined)

    consolidated = pd.concat(per_file_frames, axis=0, ignore_index=True)
    consolidated = consolidated.rename(
        columns={'Translation Type': 'Translation type'})

    # "Project-HO12_Some name.xlsx" -> "Project-HO12", "Some name"
    name_parts = (consolidated['File name']
                  .str.split('_', n=1, expand=True)
                  .reindex(columns=[0, 1]))
    project_ho = name_parts[0]
    # Remove the extension as a suffix; rstrip('.xlsx') would strip any
    # trailing '.', 'x', 'l' or 's' characters from the project name itself.
    project_name = (name_parts[1].astype(str)
                    .str.replace(r'(?i)\.xlsx$', '', regex=True))

    ho_numbers = (project_ho
                  .str.replace(r'^Project-HO', '', regex=True)
                  .astype('int'))

    details = consolidated[['Project Number', 'Language', 'Translation type',
                            'ICE', '100% Matches', '99-95% Matches',
                            '94-85% Matches', '84-75% Matches',
                            '74-0% Matches', 'Reps']].copy()
    details.insert(1, 'Project name', project_name)
    details.insert(0, 'Number_projdetails', ho_numbers)

    # Right-merge against the complete number range so that gaps in the
    # sequence become placeholder rows instead of disappearing silently.
    full_range = pd.DataFrame({
        'Number_projdetails': np.arange(ho_numbers.min(), ho_numbers.max() + 1)
    })
    details = (details.merge(full_range, how='right', on='Number_projdetails')
               .reset_index(drop=True))
    details['Number_projdetails'] = ('Project-HO'
                                     + details['Number_projdetails'].astype(str))
    details = details.rename(columns={'Number_projdetails': 'HO Number'})

    proj_details_tab = details
    return proj_details_tab

2. **def fetch_web_data()**: fetches data from a local webpage using the selenium WebDriver. It reads different elements from the page (based on various XPath criteria), creates a dictionary of fetched data, and returns the data.
    - it also checks whether the data was fetched incorrectly. If so, it prints out a message.

In [None]:
def fetch_web_data(file_path="file:///C:/Users/admin/Python/HTML.html"):
    """Scrape the ticket table and the summary links from a saved HTML page.

    Parameters
    ----------
    file_path : str, optional
        URL handed to the browser; defaults to the locally saved page.

    Returns
    -------
    tuple
        ``(fetched_data, summary_data_array)``. ``fetched_data`` is a list of
        dicts with the keys ``Language_Code``, ``Supplier_ID``, ``Dates``,
        ``Handback`` and ``Comments`` (one dict per table row), and
        ``summary_data_array`` is a list of ``{"Summary": text}`` dicts.
        ``(None, None)`` is returned when any of the lookups finds nothing.
    """
    # Only rows with more than three child elements are read.
    row_xpath = "//div[@class='table-wrap']/table/tbody/tr[count(*)>3]"

    def cell_or_highlight(position):
        # Selects the b[font] / font[b] child when the cell has one,
        # otherwise the <td> itself.
        return (f"{row_xpath}/td[{position}]/b[font] | "
                f"{row_xpath}/td[{position}]/font[b] | "
                f"{row_xpath}/td[position()={position} and not(b[font] or font[b])]")

    driver = webdriver.Chrome()
    try:
        driver.get(file_path)
        driver.maximize_window()

        summary_data = driver.find_elements(
            "xpath", "//table[@class='tableBorder']/tbody/tr/td/h3/a")
        columns = {
            "Language_Code": driver.find_elements("xpath", f"{row_xpath}/td[1]"),
            "Supplier_ID": driver.find_elements("xpath", cell_or_highlight(2)),
            "Dates": driver.find_elements("xpath", f"{row_xpath}/td[3]"),
            "Handback": driver.find_elements("xpath", cell_or_highlight(3)),
            "Comments": driver.find_elements("xpath", f"{row_xpath}/td[4]"),
        }

        # find_elements returns an empty list (never None) when nothing matches.
        if not summary_data or not all(columns.values()):
            print("Failed to fetch some elements.")
            return None, None

        if len({len(elements) for elements in columns.values()}) > 1:
            # zip() below stops at the shortest list, which would drop rows.
            print("Warning: fetched columns differ in length; rows may be misaligned.")

        # Element text has to be read while the browser session is still open.
        fetched_data = [
            {key: element.text for key, element in zip(columns, row)}
            for row in zip(*columns.values())
        ]
        summary_data_array = [{"Summary": s.text} for s in summary_data]
    finally:
        # Always close the browser, even when a lookup raises.
        driver.quit()

    return fetched_data, summary_data_array

def main():
    """Fetch the web data and return it to the caller.

    Returns the ``(fetched_data, summary_data_array)`` pair produced by
    ``fetch_web_data()``; both entries are ``None`` when fetching failed, in
    which case a message is printed as well.

    NOTE(review): a second ``main()`` is defined further down in this file and
    replaces this one as soon as that definition is executed.
    """
    fetched_data, summary_data_array = fetch_web_data()

    if fetched_data is None or summary_data_array is None:
        print("Data fetching failed.")

    # Return the pair on success too; previously only the failure path returned it.
    return fetched_data, summary_data_array

3. **def process_fetched_data()**: cleans the fetched data by removing leading and trailing whitespaces:

In [None]:
def process_fetched_data(fetched_data, summary_data_array):
    """Turn the scraped records into dataframes with whitespace-trimmed text.

    Parameters
    ----------
    fetched_data : list of dict
        Row records; becomes ``table_data``.
    summary_data_array : list of dict
        Summary records; becomes ``summary_data_df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(table_data, summary_data_df)`` with leading and trailing whitespace
        removed from every string cell; non-string cells are left untouched.
    """
    def strip_strings(frame):
        # Column-wise Series.map instead of DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        return frame.apply(
            lambda column: column.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        )

    table_data = strip_strings(pd.DataFrame(fetched_data))
    summary_data_df = strip_strings(pd.DataFrame(summary_data_array))
    return table_data, summary_data_df

4. **def convert_to_datetime()**: converts the 'Dates' column in the table data to a standard datetime format. 
    - dates were entered manually and don't always stick to a format that pandas can read, some workaround was needed:

In [None]:
def convert_to_datetime(table_data):
    """Parse the hand-typed ``Dates`` column into ``datetime.date`` values.

    Spaces and runs of dashes are collapsed into single dashes, the text is
    split into day / month / year (the year is cut to four characters so that
    trailing remarks are ignored) and the pieces are parsed day-first.
    Rows containing ``'TEP Deadline'`` and anything that cannot be parsed end
    up as missing values.

    Parameters
    ----------
    table_data : pandas.DataFrame
        Must contain a string column ``Dates``; it is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe with ``Dates`` replaced.
    """
    raw_dates = table_data['Dates']

    # Detect placeholder rows on the raw text: once spaces have been turned
    # into dashes the phrase 'TEP Deadline' can no longer be found.
    is_placeholder = raw_dates.str.contains('TEP Deadline', na=False)

    normalised = (raw_dates.str.replace(' ', '-', regex=False)
                           .str.replace('-+', '-', regex=True)
                           .str.strip()
                           .str.strip('-'))

    # reindex guarantees the three columns exist even when no row has three parts.
    parts = (normalised.str.split('-', expand=True)
                       .reindex(columns=[0, 1, 2])
                       .astype(object))
    parts = parts[~is_placeholder]

    day, month, year = parts[0], parts[1], parts[2].str[:4]

    # The string is assembled as Day-Month-Year, so parse it day-first;
    # otherwise an all-numeric date such as 05-04-2023 is read as 4 May.
    parsed = pd.to_datetime(day + '-' + month + '-' + year,
                            errors='coerce', dayfirst=True)

    # Assignment aligns on the index, so filtered-out rows become missing.
    table_data['Dates'] = parsed.dt.date
    return table_data

5. **def replace_values()**: replaces specific placeholder values in the dataframe with NaNs.

In [None]:
def replace_values(table_data):
    """Blank out placeholder strings in the scraped columns.

    Every occurrence of one of the placeholder values in the five scraped
    columns is replaced with NaN. The dataframe is modified in place and
    returned. If one of the columns is missing, a message is printed and
    ``None`` is returned instead.
    """
    # Values that stand for "no data"
    placeholders = ["REPL_VALUE1", "REPL_VALUE2", "REPL_VALUE3"]
    target_columns = ["Language_Code", "Supplier_ID", "Dates", "Handback", "Comments"]

    try:
        cleaned = table_data[target_columns].replace(placeholders, np.nan)
    except KeyError as e:
        print(f'Column: {e} is missing in the DF')
        return None

    table_data[target_columns] = cleaned
    return table_data

6. **def create_group_column() & def extract_and_map_data()**: 'def create_group_column' creates a new column 'group' which increases its value each time it encounters NaN value (in the "Language_Code" col), then 'def extract_and_map_data' processes the summary data to extract and map the 'Project-HO' and 'Project_name' columns to the fetched table data using group index.

In [None]:
def create_group_column(table_data):
    """Number the rows by block and return them sorted within each block.

    A new ``group`` column is written to *table_data*: the counter goes up by
    one on every row whose ``Language_Code`` is missing, so such a row opens a
    new group. The returned dataframe is a copy sorted by ``group`` and then
    by ``Language_Code``.
    """
    starts_new_group = table_data["Language_Code"].isna()
    table_data["group"] = starts_new_group.cumsum()
    return table_data.sort_values(by=["group", "Language_Code"])

def extract_and_map_data(summary_data_df, table_data):
    """Attach the project HO id and project name from the summaries to each row.

    The n-th distinct ``group`` value in *table_data* (in sorted order)
    receives piece 0 and piece 4 of the n-th summary as ``Project-HO`` and
    ``Project_name``. *table_data* is modified in place and returned.

    Raises
    ------
    KeyError
        If the summaries split into fewer than five pieces.
    """
    # NOTE(review): with pandas' default regex handling a pattern longer than
    # one character is treated as a regular expression, so " | " splits on
    # every single space rather than on the literal " | ". Piece 4 is therefore
    # the fifth space-separated token. Confirm against real summaries before
    # changing the pattern or the indices.
    summary_parts = summary_data_df["Summary"].str.split(" | ", expand=True)
    ho_by_position = summary_parts[0]
    name_by_position = summary_parts[4]

    # Distribute Project-HO and project name within the groups: ngroup()
    # numbers the groups 0, 1, 2, ... which is then looked up in the summary
    # rows by index label.
    group_position = table_data.groupby("group").ngroup()
    table_data["Project-HO"] = group_position.map(ho_by_position)
    table_data["Project_name"] = group_position.map(name_by_position)
    return table_data

7. **def map_supplier_names()**: reads external .xlsx file to create a dictionary mapping between 'Supplier ID' and 'Supplier name'. It then maps the 'Supplier ID' in the table data to the corresponding 'Supplier name'.

In [None]:
def map_supplier_names(table_data, map_path="../internal_map.xlsx", sheet_name="SuppIDs"):
    """Add a ``Supplier_Name`` column looked up from a mapping workbook.

    Parameters
    ----------
    table_data : pandas.DataFrame
        Must contain ``Supplier_ID``; it is modified in place.
    map_path : str, optional
        Workbook holding the mapping (defaults to the original fixed path).
    sheet_name : str, optional
        Sheet with the columns ``Supplier ID`` and ``Supplier name``.

    Returns
    -------
    pandas.DataFrame
        *table_data* with the new column; IDs without a match become NaN.
    """
    internal = pd.read_excel(map_path, sheet_name=sheet_name)

    # When an ID is listed more than once, the last entry wins.
    # NOTE(review): the lookup needs the keys to compare equal to the values in
    # 'Supplier_ID'; if the workbook stores IDs as numbers while the table
    # holds text, nothing will match. Confirm the workbook's column type.
    names_by_id = {
        supplier_id: supplier_name
        for supplier_id, supplier_name in zip(internal["Supplier ID"], internal["Supplier name"])
    }
    table_data["Supplier_Name"] = table_data["Supplier_ID"].map(names_by_id)
    return table_data

8. **def filter_and_drop_data()**: cleans the data by dropping the rows whose 'Language_Code' is "IGNORE" or missing.

In [None]:
def filter_and_drop_data(table_data):
    """Return the rows that carry a usable language code.

    Rows whose ``Language_Code`` equals ``"IGNORE"`` or is missing are left
    out. The input dataframe itself is not modified.
    """
    language_code = table_data["Language_Code"]
    keep = language_code.notna() & (language_code != "IGNORE")
    return table_data[keep]

9. **def add_supplier_names()**: adds/replaces values in the 'Supplier_Name' column based on specific conditions in the 'Comments' column.

In [None]:
def add_supplier_names(parsed_cols):
    """Override supplier name / ID for rows flagged through their comment.

    * Rows whose ``Comments`` start with one of the trigger prefixes get
      ``"EXTRA_VALUE"`` as ``Supplier_Name``.
    * Rows whose ``Supplier_Name`` is then ``"TRIGGER_VALUE2"`` get
      ``"EXTRA_VALUE"`` as ``Supplier_ID``.

    Returns a modified copy; the dataframe passed in is left untouched.
    """
    # The trailing comma makes this a real tuple, so more prefixes can be added.
    prefixes = (
        "TRIGGER_VALUE",
    )
    # Work on a copy: the caller may pass a column slice of another dataframe.
    parsed_cols = parsed_cols.copy()

    # na=False: a missing comment must count as "no match". Without it the NaN
    # would be treated as true by np.where.
    flagged = parsed_cols["Comments"].str.startswith(prefixes, na=False)
    parsed_cols["Supplier_Name"] = np.where(flagged,
                                            "EXTRA_VALUE", parsed_cols["Supplier_Name"])
    parsed_cols["Supplier_ID"] = np.where(parsed_cols["Supplier_Name"] == "TRIGGER_VALUE2",
                                          "EXTRA_VALUE", parsed_cols["Supplier_ID"])
    return parsed_cols

10. **def main()**: all these functions are called in sequence. Cleaned, processed data is then saved into multiple .xlsx files. It also prints out the number of rows in each file, checks if they match, and prints the execution time

In [20]:
def main():
    """Run the whole pipeline and write the two result workbooks.

    Steps: scrape the saved HTML page, clean and enrich the scraped table,
    consolidate the workbooks in the ``analyses`` folder, save
    ``Supplier's_info.xlsx`` and ``Analysis_logs.xlsx``, then print the row
    counts of both outputs and the execution time. The cleaned supplier table
    is also kept in the module-level ``parsed_cols``.

    Returns ``None``; stops early with a message when scraping or cleaning fails.
    """
    global parsed_cols
    warnings.filterwarnings("ignore")
    startTime = time.time()

    fetched_data, summary_data_array = fetch_web_data()

    if fetched_data is None or summary_data_array is None:
        print("Data fetching failed")
        return

    table_data, summary_data_df = process_fetched_data(fetched_data, summary_data_array)
    table_data = convert_to_datetime(table_data)
    table_data = replace_values(table_data)
    if table_data is None:
        # replace_values has already printed which column is missing.
        print("Data cleaning failed")
        return

    table_data = create_group_column(table_data)
    table_data = extract_and_map_data(summary_data_df, table_data)
    table_data = map_supplier_names(table_data)
    table_data = filter_and_drop_data(table_data)

    output_columns = ["Project-HO", "Project_name", "Language_Code", "Supplier_ID",
                      "Supplier_Name", "Dates", "Handback", "Comments"]
    # Copy so that later column assignments do not write into a slice of table_data.
    parsed_cols = table_data[output_columns].copy()
    parsed_cols = add_supplier_names(parsed_cols)

    project_details = process_files('analyses')

    # Save + prints
    parsed_cols.to_excel("Supplier's_info.xlsx", sheet_name="SUPPLIER_DATA", index=False)
    project_details.to_excel("Analysis_logs.xlsx", sheet_name="PROJECT_DETAILS", index=False)

    num_of_entries_sup = parsed_cols["Supplier_ID"].count()
    print("Supplier's rows: " + str(num_of_entries_sup))

    num_of_entries_log = project_details['HO Number'].count()
    print("Analysis logs rows: " + str(num_of_entries_log))

    # Rows without a project name are the placeholders for missing workbooks.
    num_of_nans = project_details['Project name'].isna().sum()
    if num_of_entries_sup == num_of_entries_log:
        print('Rows match')
    else:
        print("\nMissing files from the folder: " + str(num_of_nans))

    executionTime = round((time.time() - startTime), 2)
    print("\nExecution time in seconds: " + str(executionTime))

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Supplier's rows: 83
Analysis logs rows: 83
Rows match

Execution time in seconds: 11.98
