In [1]:
import pandas as pd

In [58]:
# Only these columns are read from each CSV; listing them explicitly avoids
# attempting to import the empty columns.
columns_to_import = [
    "Case Type",
    "Region",
    "Case Number",
    "Case Name",
    "Status",
    "Date Filed",
    "Date Closed",
    "Reason Closed",
    "City",
    "States & Territories",
    "Employees on charge/petition",
    "Allegations",
    "Participants",
    "Union",
    "Unit Sought",
    "Voters",
]

# Request the "string" dtype for every imported column so that Pandas does not
# try to infer a data type for each column.
column_dtypes = dict.fromkeys(columns_to_import, "string")

In [59]:
# Import the partial data files, parsing dates in the date columns. All other columns are strings.

# Columns handed to `parse_dates`, and the strptime pattern used to parse them.
DATE_COLUMNS = ["Date Filed", "Date Closed"]
# NOTE(review): "%m%d%y" matches undelimited dates with a two-digit year (e.g. 123119).
# Confirm this is the layout actually used in the CSV files.
DATE_FORMAT = "%m%d%y"


def read_case_file(path):
    """Read one partial case-data CSV into a DataFrame.

    Every file is read with identical options: only `columns_to_import` are
    loaded, `column_dtypes` is passed as the dtype mapping, and the columns in
    `DATE_COLUMNS` are passed to `parse_dates` with `DATE_FORMAT`.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The imported rows, restricted to `columns_to_import`.
    """
    return pd.read_csv(
        path,
        usecols=columns_to_import,
        dtype=column_dtypes,
        parse_dates=DATE_COLUMNS,
        date_format=DATE_FORMAT,
    )


# One frame per source file; the names are kept because later cells refer to them.
df1978_1999 = read_case_file("1978-1999.csv")
df2000_2004 = read_case_file("2000-2004.csv")
df2005_2009 = read_case_file("2005-2009.csv")
df2010 = read_case_file("2010.csv")
df2011 = read_case_file("2011.csv")
df2012 = read_case_file("2012.csv")
df2013 = read_case_file("2013.csv")
df2014 = read_case_file("2014.csv")
df2015_2019 = read_case_file("2015-2019.csv")
df2020_2024 = read_case_file("2020-2024.csv")

In [69]:
# Concatenate the partial frames into one large frame, listing the most recent
# period first and renumbering the rows from 0.
unfair_labor_practices = pd.concat(
    [df2020_2024, df2015_2019, df2014, df2013, df2012, df2011, df2010, df2005_2009, df2000_2004, df1978_1999],
    ignore_index = True,
)

# When imported, missing values became "<NA>". To match the usual Pandas convention,
# replace "<NA>" with pd.NaT and run to_datetime with errors="coerce" so that anything
# unparseable also becomes NaT. The result must be assigned back to the column:
# to_datetime returns a new Series and does not modify the frame in place.
unfair_labor_practices["Date Closed"] = pd.to_datetime(
    unfair_labor_practices["Date Closed"].replace("<NA>", pd.NaT), errors = "coerce"
)

# "Date Filed" is not converted here; display the rows whose "Date Filed" is the literal "<NA>".
unfair_labor_practices[unfair_labor_practices["Date Filed"] == "<NA>"]

Unnamed: 0,Case Type,Region,Case Number,Case Name,Status,Date Filed,Date Closed,Reason Closed,City,States & Territories,Employees on charge/petition,Allegations,Participants,Union,Unit Sought,Voters
76880,ervice,Lima,OH,45804-4169,,,,,,,,,,,,
383799,,,,,,,,,,,,,,,,
383800,8(a)(5) Refusal to Furnish Information,,,,,,,,,,,,,,,
383801,8(a)(5) Refusal to Bargain/Bad Faith Bargainin...,,,,,,,,,,,,,,,
383802,8(a)(3) Changes in Terms and Conditions of Emp...,,,,,,,,,,,,,,,


In [78]:
# Export into files by year so that file sizes are not too large
# TODO

In [77]:
# TODO: broken entries in rows 10108 - 10111
# Show the rows of the 2000-2004 frame whose "Date Filed" equals the literal "<NA>".
date_filed_is_na_2000_2004 = df2000_2004["Date Filed"].eq("<NA>")
df2000_2004.loc[date_filed_is_na_2000_2004]

Unnamed: 0,Case Type,Region,Case Number,Case Name,Status,Date Filed,Date Closed,Reason Closed,City,States & Territories,Employees on charge/petition,Allegations,Participants,Union,Unit Sought,Voters
10108,,,,,,,,,,,,,,,,
10109,8(a)(5) Refusal to Furnish Information,,,,,,,,,,,,,,,
10110,8(a)(5) Refusal to Bargain/Bad Faith Bargainin...,,,,,,,,,,,,,,,
10111,8(a)(3) Changes in Terms and Conditions of Emp...,,,,,,,,,,,,,,,


In [100]:
# TODO: broken entry in row 1194
# Spot-check: run pd.to_datetime on the "Date Filed" value at position 24.
# NOTE(review): this inspects position 24, not row 1194 -- confirm which row was intended.
date_filed_sample = df2015_2019["Date Filed"].iloc[24]
pd.to_datetime(date_filed_sample)

Timestamp('2019-12-31 00:00:00')