In [1]:
import pandas as pd
import os
import csv
import nltk
import string

# Output folder for the CSV files written further down
working_directory = "./37-Determine-Primary-Labels"

# Make sure the folder is present; a failure is reported but does not stop the notebook
try:
    os.makedirs(name=working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [2]:
# Suffix embedded in the names of the CSV files saved by this notebook
TAG = "2023_05_14"

In [3]:
# Load the manually labeled CSV; every blank cell becomes 0
labeled_csv_path = f"./labeled_data/FOI_TEXT_WITH_ROLES_MANAULLY_LABELED_2023_05_01.csv"
data = pd.read_csv(labeled_csv_path).fillna(0)

In [4]:
# Rename the "Row Labels" column to ROW_ID, the key used for the merge later on
data = data.rename({"Row Labels": "ROW_ID"}, axis="columns")

In [5]:
# Dimensions of the labeled data as (rows, columns)
data.shape

(5688, 6)

In [6]:
# Preview the first rows: ROW_ID followed by one numeric column per label
data.head()

Unnamed: 0,ROW_ID,Stakeholder Unassigned,Quality Engineering,Manufacturing Engineer,Management,Design Engineer
0,785,1.0,2.0,1.0,0.0,0.0
1,868,1.0,2.0,0.0,1.0,1.0
2,3992,3.0,1.0,0.0,0.0,0.0
3,5199,0.0,3.0,0.0,0.0,0.0
4,5866,0.0,3.0,0.0,0.0,0.0


In [7]:
# Columns that hold the per-label counts: everything after the first (ID) column.
# A PRIMARY_LABEL column left over from an earlier run is skipped so this cell
# can be re-executed without comparing label strings against numbers.
label_columns = [col for col in data.columns[1:] if col != "PRIMARY_LABEL"]

# Tie-break rule: when several columns share the row maximum, the first name in
# TIE_BREAK_PRIORITY that is among the tied columns wins; if none of them is
# tied, the label falls back to TIE_BREAK_FALLBACK. "Quality Engineering" is
# deliberately absent from both, so it never wins a tie.
TIE_BREAK_PRIORITY = ["Manufacturing Engineer", "Design Engineer", "Management"]
TIE_BREAK_FALLBACK = "Stakeholder Unassigned"

role_counts = data[label_columns]

# Boolean frame marking, per row, every column that holds that row's maximum
row_max = role_counts.max(axis=1)
holds_max = role_counts.eq(row_max, axis=0)

# A row is tied when more than one column holds the maximum
is_tied = holds_max.sum(axis=1) > 1
tie_breaker_count = int(is_tied.sum())

# Untied rows: the single column holding the maximum
primary_label = role_counts.idxmax(axis=1)

# Tied rows: start from the fallback and apply the priority list from lowest to
# highest priority, so the highest-priority tied column is written last and wins.
tie_break_label = pd.Series(TIE_BREAK_FALLBACK, index=data.index, dtype=object)
for candidate in reversed(TIE_BREAK_PRIORITY):
    if candidate in holds_max.columns:
        tie_break_label = tie_break_label.mask(holds_max[candidate], candidate)

# NOTE(review): a row whose counts are all 0 is a tie between every column and
# therefore receives the top-priority label ("Manufacturing Engineer"), exactly
# as the previous row-by-row loop did — confirm this is intended for rows
# without any label.

# Write the primary label to the DataFrame in a single assignment
data["PRIMARY_LABEL"] = primary_label.where(~is_tied, tie_break_label)

In [8]:
# Report how many rows needed a tie break, i.e. had more than one column at the row maximum
print(f"Number of tie breakers: {tie_breaker_count}\n")

Number of tie breakers: 1422



In [9]:
# Summarize how often each primary label was assigned: a count and its share of all rows
label_counts = data.value_counts("PRIMARY_LABEL")
label_counts_df = pd.DataFrame(
    {"PRIMARY_LABEL": label_counts.index, "COUNT": label_counts.values}
).assign(PERCENTAGE=lambda frame: frame["COUNT"] / len(data) * 100)

label_counts_df

Unnamed: 0,PRIMARY_LABEL,COUNT,PERCENTAGE
0,Quality Engineering,3352,58.931083
1,Stakeholder Unassigned,1682,29.571027
2,Management,550,9.66948
3,Manufacturing Engineer,59,1.037271
4,Design Engineer,45,0.791139


In [10]:
# Save the updated DataFrame (now including PRIMARY_LABEL) to a TAG-stamped CSV
# in the working directory; index=False keeps the DataFrame index out of the file
data.to_csv(f"{working_directory}/data_with_primary_label_{TAG}.csv", index=False)

In [11]:
# Store ROW_ID as text — presumably to match the ROW_ID dtype of the frame it is
# merged with later (TODO confirm)
data = data.astype({'ROW_ID': str})
data.head()

Unnamed: 0,ROW_ID,Stakeholder Unassigned,Quality Engineering,Manufacturing Engineer,Management,Design Engineer,PRIMARY_LABEL
0,785,1.0,2.0,1.0,0.0,0.0,Quality Engineering
1,868,1.0,2.0,0.0,1.0,1.0,Quality Engineering
2,3992,3.0,1.0,0.0,0.0,0.0,Stakeholder Unassigned
3,5199,0.0,3.0,0.0,0.0,0.0,Quality Engineering
4,5866,0.0,3.0,0.0,0.0,0.0,Quality Engineering


In [12]:
# Read in the working data (the path suggests it was written by the
# 21-Preprocess-Combined-Data-v2 step — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle("./21-Preprocess-Combined-Data-v2/dataframe.pickle")

In [13]:
# Keep only the ROW_ID and FOI_TEXT columns; every other column is discarded
columns_to_keep = ['ROW_ID', 'FOI_TEXT']
df = df.loc[:, df.columns.isin(columns_to_keep)]

In [14]:
# Preview the first rows of the reduced frame (ROW_ID and FOI_TEXT only)
df.head()

Unnamed: 0,ROW_ID,FOI_TEXT
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...
2,2609625,IT WAS REPORTED THAT TRANSMITTER FAILED ERROR ...
3,2813837,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...
4,1337517,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...


In [15]:
# Dimensions of the text data as (rows, columns)
df.shape

(5736, 2)

In [16]:
# Attach the labels to the text rows with an inner join on ROW_ID, so only IDs
# present in both frames are kept.
# NOTE(review): in the saved outputs 5688 labeled rows produce 5692 merged rows,
# which means some ROW_ID occurs more than once in df — confirm that is expected.
merged_df = df.merge(data, how="inner", on="ROW_ID")

In [17]:
# Keep only ROW_ID, FOI_TEXT and PRIMARY_LABEL; the per-label count columns are discarded
output_columns = ['ROW_ID', 'FOI_TEXT', 'PRIMARY_LABEL']
merged_df = merged_df.loc[:, merged_df.columns.isin(output_columns)]

In [18]:
# Dimensions of the merged data as (rows, columns)
merged_df.shape

(5692, 3)

In [19]:
# Spot-check the merge by looking up a single ROW_ID
merged_df.loc[merged_df["ROW_ID"] == "785"]

Unnamed: 0,ROW_ID,FOI_TEXT,PRIMARY_LABEL
1972,785,IT WAS REPORTED THAT AN INACCURACY BETWEEN THE...,Quality Engineering


In [20]:
# Preview the first rows of the merged result (ROW_ID, FOI_TEXT, PRIMARY_LABEL)
merged_df.head()

Unnamed: 0,ROW_ID,FOI_TEXT,PRIMARY_LABEL
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,Quality Engineering
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,Quality Engineering
2,2609625,IT WAS REPORTED THAT TRANSMITTER FAILED ERROR ...,Quality Engineering
3,2813837,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,Quality Engineering
4,1337517,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,Quality Engineering


In [21]:
# Save the merged frame to a TAG-stamped CSV in the working directory;
# index=False keeps the DataFrame index out of the file
merged_df.to_csv(f"{working_directory}/labeled_foi_text_{TAG}.csv", index=False)

In [24]:
# Rows in the merged frame as a percentage of the rows in df
labeled_row_count = len(merged_df)
total_row_count = len(df)
percent_foi_text_labeled = labeled_row_count / total_row_count * 100
print(f"Percent of FOI Text labeled = {percent_foi_text_labeled}")

Percent of FOI Text labeled = 99.23291492329149
