### This script reads a list of .xls files from two directories, manipulates and compares data from each pair of files, and then writes the data to a new .xls file. The goal is to compare and contrast 'client files' and 'linguistic files', related to a form of language analysis.

#### Here's a detailed breakdown of what it does:

- Import the required libraries, record the start time, and set the pandas display options:

In [None]:
import pandas as pd
import xlsxwriter
import numpy as np
import os
import time
import glob

# Wall-clock reference point; the elapsed run time is computed against it.
startTime = time.time()

# Display preferences: floats are rendered without decimals and
# up to 100 columns are shown when a frame is printed.
pd.options.display.float_format = '{:.0f}'.format
pd.options.display.max_columns = 100

- Reads the 'supp_codes.xlsx' file into a pandas dataframe and converts it into a dictionary, 'internal_dict'. This dictionary is used later to map language names to their respective codes:

In [None]:
# Load the lookup workbook; 'internal_dict' temporarily holds the raw table
# and is replaced by a plain {Target_Language: Support_Code} dict below.
internal_dict = pd.read_excel('supp_codes.xlsx')
try:
    # Both columns must be present; a missing one raises KeyError.
    languages = internal_dict['Target_Language']
    codes = internal_dict['Support_Code']
except KeyError as e:
    print(f"Error: Key {str(e)} not found in the supp_codes.xlsx file. Please check the file's column names.")
    raise
else:
    internal_dict = {language: code for language, code in zip(languages, codes)}

**Set the directories for files:**
   - Create a list of all .xls files in each directory using glob().
   - Check if the number of files in each directory is the same. If not, an error message is displayed.

In [None]:
# Input folders: one holds the client analyses, the other the linguistic ones.
path_client = 'folder_client'
path_ling = 'folder_ling'

# Both folders are required; stop immediately if either one is missing.
if not (os.path.isdir(path_client) and os.path.isdir(path_ling)):
    print(f"Error: The directories {path_client} or {path_ling} do not exist.")
    raise FileNotFoundError(f"The directories {path_client} or {path_ling} do not exist.")

# Collect every .xls workbook found directly inside each folder.
client_analyses = glob.glob(os.path.join(path_client, "*.xls"))
ling_analyses = glob.glob(os.path.join(path_ling, "*.xls"))

client_files_count, ling_files_count = len(client_analyses), len(ling_analyses)

# Report whether the two folders hold the same number of workbooks.
# A mismatch is only reported here; execution continues either way.
if client_files_count == ling_files_count:
    print(f"No discrepancies found. \nClient analyses: x{client_files_count}, Linguistic analyses: x{ling_files_count}")
else:
    print(f"Error: The quantity of client analyses and linguistic analyses differs. \nClient analyses: x{client_files_count}, Ling analyses: x{ling_files_count}")

Extract unique IDs from each pair of files in the folders. The unique IDs are the last element of the filename when split by "_". These IDs are used to match files between the two directories:

In [None]:
# Collect the ID of every workbook in BOTH folders.
#
# The ID is the last "_"-separated piece of the file name, cut at the first
# "." (so "report_ABC123.xls" yields "ABC123").
#
# The two lists are walked independently rather than zipped together:
# zip() stops at the shorter list, which would silently drop the IDs of the
# surplus files whenever the folders hold different numbers of workbooks.
unique_ids = {
    os.path.basename(path).split("_")[-1].split(".")[0]
    for path in client_analyses + ling_analyses
}

**For each unique ID, it tries to find a match in each directory. If it doesn't find a match, it notifies the user and skips to the next ID**.
   - If it does find a matching pair, it reads each file into a dataframe, sets the column names using the first row of data, and then removes the first and last rows.
   - Then it creates a pivot table for each dataframe, indexing by 'Language' and calculating the sum for certain columns.

In [None]:
def _analysis_id(path):
    """Return the ID embedded in an analysis file name.

    The ID is the last "_"-separated piece of the base name, cut at the
    first "." (the same rule used to build ``unique_ids``).
    """
    return os.path.basename(path).split("_")[-1].split(".")[0]


def _load_analysis(path):
    """Read one analysis workbook and return its cleaned data rows.

    The first data row is promoted to column names; that row and the final
    row are then dropped, and the index is renumbered from 0.
    """
    frame = pd.read_excel(path)
    frame.columns = frame.iloc[0]
    return frame.iloc[1:-1].reset_index(drop=True)


for unique_id in unique_ids:
    # Match on the extracted ID, not on a substring of the whole path:
    # a substring test lets ID "1" match "..._10.xls" or even a folder name.
    file_c = next((path for path in client_analyses if _analysis_id(path) == unique_id), None)
    file_l = next((path for path in ling_analyses if _analysis_id(path) == unique_id), None)

    # Both sides of the pair are required; report the missing one and move on.
    if file_c is None:
        print(f"\n'{unique_id}' not found in the '{path_client}' folder. Please add it and run the code again.")
        continue

    if file_l is None:
        print(f"\n'{unique_id}' not found in the '{path_ling}' folder. Please add it and run the code again.")
        continue

    df = _load_analysis(file_c)
    df_l = _load_analysis(file_l)

    # Per-language sums of the count columns. The linguistic workbooks carry
    # one extra column ('Repetitions75% - 99%') that the client ones lack.
    client_pivot = pd.pivot_table(
        df,
        index=['Language'],
        values=['Segments', 'Context', '100% Match', '75% - 99%', 'No Match', 'Repetitions'],
        aggfunc='sum',
    )
    ling_pivot = pd.pivot_table(
        df_l,
        index=['Language'],
        values=['Segments', 'Context', '100% Match', '75% - 99%', 'Repetitions75% - 99%', 'No Match', 'Repetitions'],
        aggfunc='sum',
    )

**It rearranges columns, renames some columns, and does further data manipulations:**
   - Maps each language (the pivot index) to its code in a 'Support_Code' column using internal_dict.
   - Saves the result and prints out execution time in seconds.

In [1]:
    # Keep only the columns that go into the report ('Segments' is dropped).
    # .copy() makes the frames independent so the column assignments below
    # never act on a slice of the pivot result.
    client_pivot = client_pivot[['Context', '100% Match', '75% - 99%', 'No Match', 'Repetitions']].copy()
    ling_pivot = ling_pivot[['Context', '100% Match', '75% - 99%', 'Repetitions75% - 99%', 'No Match', 'Repetitions']].copy()

    # Placeholder bands: always zero, present only to keep the report layout.
    for pivot in (client_pivot, ling_pivot):
        pivot['99-95% Matches'] = 0
        pivot['94-85% Matches'] = 0

    # Client sheet: order the columns and normalise the 75-99 band name.
    client_file = client_pivot[
        ['Context', '100% Match', '99-95% Matches', '94-85% Matches', '75% - 99%', 'No Match', 'Repetitions']
    ].rename(columns={'75% - 99%': '75%-99%'})

    # Linguist sheet: fold '75% - 99%' and 'Repetitions75% - 99%' into a
    # single '75%-99%' column (pop removes both source columns).
    ling_file = ling_pivot[
        ['Context', '100% Match', '99-95% Matches', '94-85% Matches', '75% - 99%', 'Repetitions75% - 99%', 'No Match', 'Repetitions']
    ].copy()
    ling_file['75%-99%'] = ling_file.pop('75% - 99%') + ling_file.pop('Repetitions75% - 99%')

    # Row totals over every count column present at this point.
    client_file['Total'] = client_file.sum(axis=1)
    ling_file['Total'] = ling_file.sum(axis=1)

    # Weighted volume (linguist sheet only): 100% matches count for 0.2,
    # the 75-99 band for 0.3, and no-match segments in full.
    ling_file['Adjusted_Volume'] = (
        ling_file['100% Match'] * 0.2 + ling_file['75%-99%'] * 0.3 + ling_file['No Match']
    ).round()

    # Replace spaces with underscores in the column names of both sheets.
    client_file.columns = client_file.columns.str.replace(' ', '_')
    ling_file.columns = ling_file.columns.str.replace(' ', '_')

    # Look up each language (the pivot index) in internal_dict; languages
    # without an entry get NaN.
    client_file['Support_Code'] = client_file.index.to_series().map(internal_dict)
    ling_file['Support_Code'] = ling_file.index.to_series().map(internal_dict)

    # Shorten the band names to the labels used in the final layout. The
    # original selected '100%', '99-95%' and '94-85%' without ever creating
    # them, which raised KeyError.
    short_names = {
        '100%_Match': '100%',
        '99-95%_Matches': '99-95%',
        '94-85%_Matches': '94-85%',
    }
    # NOTE(review): the original asked for 'Content' on the client sheet, but
    # the column produced above is 'Context'; both sheets now use 'Context'.
    # The tail of the client column list was truncated in the source and is
    # assumed to end with 'Repetitions', 'Total' -- confirm.
    final_columns = ['Support_Code', 'Context', '100%', '99-95%', '94-85%', '75%-99%', 'No_Match', 'Repetitions', 'Total']

    client_file = client_file.rename(columns=short_names)[final_columns]
    ling_file = ling_file.rename(columns=short_names)[final_columns + ['Adjusted_Volume']]

    # One workbook per ID, with one sheet per side of the comparison.
    with pd.ExcelWriter('Consolidated_' + str(unique_id) + '.xlsx') as writer:
        client_file.to_excel(writer, sheet_name='Client')
        ling_file.to_excel(writer, sheet_name='Linguist')

# Elapsed wall-clock time since startTime, rounded to one decimal place.
elapsed_seconds = time.time() - startTime
executionTime = round(elapsed_seconds, 1)
print(f'\nExecution time in seconds: {executionTime} sec.')

Error: The quantity of client analyses and linguistic analyses differs. 
Client analyses: x10, Ling analyses: x11

'test-project' not found in the 'ab_client' folder. Please add it and run the code again.

Execution time in seconds: 0.5 sec.
