In [1]:
import pandas as pd
# Let displayed frames show up to 100 rows and 100 columns before truncating
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

In [2]:
# Load the cleaned per-year case files and stack them into one frame

# Every listed column is declared as a string up front, so that Pandas does not
# try to infer a data type for each column
columns_to_import = [
    "Case Type", "Region", "Case Number", "Case Name", "Status",
    "Date Filed", "Date Closed", "Reason Closed", "City",
    "States & Territories", "Employees on charge/petition",
    "Allegations", "Participants",
]
column_dtypes = dict.fromkeys(columns_to_import, "string")


def _read_cleaned_cases(year):
    """Read the cleaned CSV for one year label and fix up its column types.

    `year` is either the string "pre2000" or an integer year; it is only used
    to build the file name. Returns the resulting DataFrame.
    """
    cases = pd.read_csv(
        f"data/UnfairLaborPractices/cleaned/{year}.csv",
        dtype=column_dtypes,
        parse_dates=["Date Filed", "Date Closed"],
        date_format="%Y-%m-%d",
    )
    # Date Closed is converted to datetime by hand, because some entries are <NA>
    date_closed = cases["Date Closed"].replace("<NA>", pd.NaT)
    cases["Date Closed"] = pd.to_datetime(date_closed, errors="coerce")
    # Nullable integer dtype for the employee counts
    employees = cases["Employees on charge/petition"]
    cases["Employees on charge/petition"] = employees.astype("Int64")
    return cases


data_by_year = []
for year in ["pre2000", *range(2000, 2025)]:
    cases_that_year = _read_cleaned_cases(year)
    data_by_year.append(cases_that_year)

# Stack the files in reverse order (the 2024 file first, "pre2000" last)
ULPs = pd.concat(data_by_year[::-1], ignore_index=True)

In [3]:
# Number of rows in the combined frame: 400340 cases in total
ULPs.shape[0]

400340

In [4]:
# Count the cases filed in each year to see which years are well populated.
# From 2007 onwards the volume is fairly consistently ~20k/year, so it may be
# best to restrict our attention to those years.
filing_year = ULPs["Date Filed"].dt.year
filing_year.value_counts(sort=False)

Date Filed
2024     5407
2023    19408
2022    18113
2021    15178
2020    14733
2019    17809
2018    18891
2017    19145
2016    20933
2015    19915
2014    19385
2013    21143
2012    21032
2011    21335
2010    21824
2009    23015
2008    21125
2007    21686
2006    12794
2005     8818
2004     7114
2003     7962
2002     7496
2001     5122
2000     3882
1999     3235
1998     1634
1997      794
1996      510
1995      314
1994      196
1993      137
1992       93
1991       43
1990       35
1989       32
1988        9
1987        9
1986        9
1985        5
1984        5
1983        3
1982        3
1981        3
1980        1
1978        5
Name: count, dtype: int64

In [5]:
# Get rid of cases prior to 2007. This leaves 340077 cases.
# reset_index(drop=True) returns a fresh frame labelled 0..n-1 rather than a
# slice of the full frame, so later cells can add columns to it without a
# SettingWithCopyWarning and row-ordered data lines up with the rows.
ULPs = ULPs[ULPs["Date Filed"].dt.year >= 2007].reset_index(drop = True)
ULPs.shape[0]

340077

In [6]:
# Each case has a number of parties involved, and all of them are stored in the "Participants" column
# in one big messy string. There are three types of parties:
#     1. Charged parties (i.e. the accused)
#     2. Charging parties (i.e. the accuser)
#     3. Involved parties
# Below, we split these types of parties into three new columns "Charged", "Charging", and "Involved"
# Each entry in these columns is a list of parties, each of which is a string

from utilities import get_charged_charging_and_involved

parties_by_case = ULPs["Participants"].apply(get_charged_charging_and_involved)
# Column assignment aligns on row labels. A DataFrame built from a plain list
# gets labels 0..n-1, so reuse ULPs' own index; otherwise any row whose label
# falls outside 0..n-1 (e.g. after filtering) would silently be filled with NaN.
ULPs[["Charged", "Charging", "Involved"]] = pd.DataFrame(parties_by_case.to_list(), index = ULPs.index)

In [7]:
# Inspect the new column of charged parties
ULPs.loc[:, "Charged"]

0         [Legal Representative, Caulkins Charles, Fishe...
1         [Employer, Christy's Cleaning Service, LLC., G...
2                     [Union, SEIU-UHW, Oakland, CA, 94612]
3         [Union, International Brotherhood of Teamsters...
4                 [Employer, ADAMAS, Rutherford, NJ, 07070]
                                ...                        
340072    [Employer, TRI-STATE COCA-BOTTLING, CINCINNATI...
340073    [Employer, TRI-STATE COCA-COLA, CINCINNATI, OH...
340074    [, THOMPSON COBURN, Not Available, St Louis, M...
340075    [Notification, O'DONNELL JOHN, Murphy Anderson...
340076    [Legal Representative, Brennan Megan, United S...
Name: Charged, Length: 340077, dtype: object

In [None]:
# We are interested in the employers and unions involved in each case, and less
# so in the various legal representatives and notified parties. We filter out
# all the parties except for unions and employers.
# TODO