In [1]:
import pandas as pd
import os

# Directory that holds the raw survey exports (one .xlsx workbook per export).
directory = "Dirty Data"

# Collect the workbooks in sorted order so the stacked row order is the same on
# every run (os.listdir returns entries in arbitrary order). Names starting
# with "~$" are the lock files Excel creates while a workbook is open; they end
# in .xlsx but are not readable workbooks, so they are skipped.
excel_files = sorted(
    name for name in os.listdir(directory)
    if name.endswith('.xlsx') and not name.startswith('~$')
)

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not excel_files:
    raise ValueError(f"No .xlsx files found in directory {directory!r}")

# Read each workbook into its own DataFrame.
dfs = []
for file in excel_files:
    df = pd.read_excel(os.path.join(directory, file))
    dfs.append(df)

# Stack all workbooks row-wise into a single frame with a fresh 0..n-1 index.
stacked_df = pd.concat(dfs, ignore_index=True)

# Parse "Start time" so it can be compared against a date.
stacked_df['Start time'] = pd.to_datetime(stacked_df['Start time'])

# Keep only responses that started on or after March 26, 2024.
# .copy() gives later cells an independent frame to assign columns on, which
# avoids pandas' SettingWithCopyWarning when the agency column is cleaned.
stacked_df = stacked_df[stacked_df['Start time'] >= '2024-03-26'].copy()

# Now stacked_df contains only the rows with "Start time" on or after March 26, 2024

# Identifying the Data Agency Errors:

In [2]:
# Normalise the free-text agency answers before comparing them with the
# reference list: trim surrounding whitespace, then turn any non-breaking
# space (U+00A0) into a regular space (U+0020).
agency_question = 'Which agency are you a part of?'
cleaned_agency = stacked_df[agency_question].str.strip()
cleaned_agency = cleaned_agency.str.replace('\u00A0', '\u0020', regex=False)
stacked_df[agency_question] = cleaned_agency

#stacked_df

In [3]:
# Load the reference list of agency names and trim leading/trailing
# whitespace from the "Agency" column so it compares cleanly with the answers.
agency_df = pd.read_excel("Agency List.xlsx")
agency_df = agency_df.assign(Agency=agency_df['Agency'].str.strip())
#agency_df

In [4]:
# Collect the distinct agency answers that do not appear in the reference
# list (agency_df['Agency']).
agency_answers = stacked_df['Which agency are you a part of?']
is_known_agency = agency_answers.isin(agency_df['Agency'])
not_found_values = agency_answers[~is_known_agency].unique()

# Write the unmatched answers to a one-column workbook.
not_found_df = pd.DataFrame({'Agency Not Found': not_found_values})
not_found_df.to_excel("agency_not_found.xlsx", index=False)
#not_found_df

# Fixing the Data Issue:

In [5]:
# Read the mapping from raw agency answers to corrected agency names. The code
# below relies on the sheet having a 'Which agency are you a part of?' key
# column and a 'New Agency' column.
corrected_agency_df = pd.read_excel("Corrected Agency List.xlsx", sheet_name="Corrected Agency List")

# Apply the mapping with a left join so every survey response is kept.
# validate='many_to_one' makes pandas raise a MergeError if a raw answer is
# listed more than once in the mapping sheet; without it a duplicated key
# would silently duplicate survey responses in the output.
merged_df = stacked_df.merge(
    corrected_agency_df,
    on='Which agency are you a part of?',
    how='left',
    validate='many_to_one',
)

# Move 'New Agency' from wherever the merge placed it to index 5 (the 6th
# column).
columns = merged_df.columns.tolist()
columns.remove('New Agency')
columns.insert(5, 'New Agency')

# Reorder, drop the column holding the uncorrected answers, and rename the
# corrected column so it carries the original question text.
merged_df = (
    merged_df[columns]
    .drop(columns=['Which agency are you a part of?'])
    .rename(columns={'New Agency': 'Which agency are you a part of?'})
)

#merged_df
#Checking to make sure it worked.
#filtered_df = merged_df[merged_df['Which agency are you a part of?'].isin(not_found_values)]
#pd.set_option('display.max_rows', None)
#filtered_df[['Which agency are you a part of?', 'New Agency']]

In [6]:
# Derive the survey year from the response start time and make it the first
# column of the frame.
survey_year = merged_df['Start time'].dt.year
merged_df['OKSEES Year'] = survey_year
year_column = merged_df.pop('OKSEES Year')
merged_df.insert(0, 'OKSEES Year', year_column)
#merged_df

In [7]:
# Read the agency -> cabinet lookup sheet.
cabinet_df = pd.read_excel("Corrected Agency List.xlsx", sheet_name="Agency to Cabinet")

# Attach the cabinet to every response with a left join.
# validate='many_to_one' makes pandas raise a MergeError if an agency appears
# more than once in the lookup sheet; without it a duplicated agency would
# silently duplicate survey responses.
merged_df_2 = merged_df.merge(
    cabinet_df,
    on='Which agency are you a part of?',
    how='left',
    validate='many_to_one',
)

# Show every column when a frame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Move 'Cabinet' to index 11 (the 12th column).
merged_df_2.insert(11, 'Cabinet', merged_df_2.pop('Cabinet'))
#merged_df_2
#cabinet_df

In [8]:
# Columns whose answers need every "." character removed.
period_columns = ['How long have you worked for your agency?',
                  'In which generation were you born?',
                  'Are you a full-time employee?',
                  'How would you describe your work setting?']

# regex=False treats '.' as a literal period. The default value of `regex`
# differs between pandas versions (and older versions emit a FutureWarning for
# this exact call), so it is passed explicitly to get the same result on any
# version and to rule out '.' being read as the "any character" regex.
for col in period_columns:
    merged_df_2[col] = merged_df_2[col].str.replace('.', '', regex=False)


  merged_df_2[col] = merged_df_2[col].str.replace('.', '')


In [9]:
# Before saving, drop the responses whose cabinet is "Legislative Branch",
# "Governor", "Judicial Branch" or "Cabinet Not Found".
values_to_remove = ["Legislative Branch", "Governor", "Judicial Branch", "Cabinet Not Found"]

# Build the exclusion mask once, then keep only the rows outside it.
is_excluded_cabinet = merged_df_2['Cabinet'].isin(values_to_remove)
merged_df_2 = merged_df_2[~is_excluded_cabinet]

In [10]:
# Destination workbook for the cleaned survey data.
output_path = 'Cleaned Data/OKSEES Clean Data 2024.xlsx'

# Make sure the output folder exists so the write does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Excel does not allow a column wider than 255 characters.
MAX_EXCEL_COLUMN_WIDTH = 255

# The context manager saves and closes the workbook even if a formatting step
# below raises, so no half-written file handle is left open.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    # Write the cleaned frame to a single sheet without the index column.
    merged_df_2.to_excel(writer, index=False, sheet_name='Sheet1')

    # xlsxwriter worksheet object, used for the formatting calls below.
    worksheet = writer.sheets['Sheet1']

    # Turn on autofilter over the header row plus every data row and column.
    worksheet.autofilter(0, 0, merged_df_2.shape[0], merged_df_2.shape[1] - 1)

    # Size each column to its longest cell text or its header, whichever is
    # longer, plus 2 characters of padding.
    for i, col in enumerate(merged_df_2.columns):
        longest_value = merged_df_2[col].astype(str).str.len().max()
        # .max() is NaN when the frame has no rows; fall back to the header.
        if pd.isna(longest_value):
            longest_value = 0
        # str(col) so a non-string column label still has a length.
        column_len = max(int(longest_value), len(str(col))) + 2
        # Long free-text answers could exceed Excel's column width limit.
        column_len = min(column_len, MAX_EXCEL_COLUMN_WIDTH)
        worksheet.set_column(i, i, column_len)