In [51]:
import pandas as pd
import numpy as np
import duckdb

In [225]:
# Relative path of the contracts CSV file that is read into `df` further down.
FILE_PATH: str = "data/Competency Test_Contracts_20250721.csv"

### 0. Overall

In [226]:
# Load the raw contracts CSV into memory; every later cell reads from `df`.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [5]:
# Record the frame's dimensions; `n_rows` is reused later as the denominator
# for the "remaining rows" ratios.
n_rows: int = df.shape[0]
n_columns: int = df.shape[1]
print(f"n_rows: {n_rows}", f"n_columns: {n_columns}", sep="\n")

n_rows: 180012
n_columns: 19


In [8]:
# Print each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180012 entries, 0 to 180011
Data columns (total 19 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Purchase Order Description        177231 non-null  object 
 1   Purchase Order (Contract) Number  180012 non-null  object 
 2   Revision Number                   180012 non-null  object 
 3   Specification Number              167785 non-null  object 
 4   Contract Type                     135265 non-null  object 
 5   Start Date                        118018 non-null  object 
 6   End Date                          117240 non-null  object 
 7   Approval Date                     179991 non-null  object 
 8   Department                        169465 non-null  object 
 9   Vendor Name                       180012 non-null  object 
 10  Vendor ID                         180012 non-null  object 
 11  Address 1                         179441 non-null  o

In [9]:
# Preview the first five records of the raw data.
df.head(n=5)

Unnamed: 0,Purchase Order Description,Purchase Order (Contract) Number,Revision Number,Specification Number,Contract Type,Start Date,End Date,Approval Date,Department,Vendor Name,Vendor ID,Address 1,Address 2,City,State,Zip,Award Amount,Procurement Type,Contract PDF
0,CDPH-RW-PA: PCHH-CORP PO 116678 SINAI HEALTH S...,302701,0,1042731,DELEGATE AGENCY,,,03/13/2025,CHICAGO DEPARTMENT OF PUBLIC HEALTH,SINAI HEALTH SYSTEM,41617145V,1500 S CALIFORNIA BLVD,,CHICAGO,IL,60608,25222.0,,
1,DEMOLITION,E011442,0,E968020030,,,,,DEPARTMENT OF BUILDINGS,MIDWEST WRECKING COMPANY 01,14878414T,1950 W HUBBARD ST,,CHICAGO,IL,60622,17500.0,,
2,JANITORIAL SUPPLIES,T27500,0,B74851001,,12/01/1997,11/30/1999,,,"CHICAGO UNITED INDUSTRIES, LIMITED",22085024V,53 W JACKSON BLVD # 1450,,CHICAGO,IL,60604-3806,0.0,,
3,MASTER AGREEMENT FOR DEMOLITION SERVICES,C02163,0,B89683203,,,,,DEPARTMENT OF BUILDINGS,"DEMOLITION & DEVELOPMENT, LIMITED.",22414299V,P.0. BOX 10263,,CHICAGO,IL,60610,7000.0,,
4,MASTER AGREEMENT FOR DEMOLITION SERVICES,C02106,0,B89683203,,,,,DEPARTMENT OF BUILDINGS,"DEMOLITION & DEVELOPMENT, LIMITED.",22414299V,P.0. BOX 10263,,CHICAGO,IL,60610,8200.0,,


### 1. Data cleaning

#### Identify and handle missing values in Contract Type, Department, and date fields.

In [None]:
### NULL VALUE IDENTIFICATION ###

# Fraction of missing entries per column, ordered from most to least missing.
df.isna().mean().sort_values(ascending=False)

Contract PDF                        1.000000
Address 2                           0.892807
Procurement Type                    0.701137
End Date                            0.348710
Start Date                          0.344388
Contract Type                       0.248578
Specification Number                0.067923
Department                          0.058591
Purchase Order Description          0.015449
Zip                                 0.011477
City                                0.005672
Address 1                           0.003172
Approval Date                       0.000117
State                               0.000056
Purchase Order (Contract) Number    0.000000
Vendor ID                           0.000000
Award Amount                        0.000000
Revision Number                     0.000000
Vendor Name                         0.000000
dtype: float64

We found that the columns `Contract Type`, `End Date`, and `Start Date` contain many NULL values, while `Department` and `Approval Date` contain only a few.

**Solution:**

It is difficult for me to determine the root cause of the missing values. My guess is that they come from human error or missing information when the data was generated.

Therefore, I consider several approaches to handle them:

1. Drop Missing Values

In [53]:
# Case 1: discard every row that is NULL in at least one of the key columns.
drop_columns: list = [
    "Department",
    "Contract Type",
    "Start Date",
    "End Date",
    "Approval Date",
]
df_case1: pd.DataFrame = df.dropna(subset=drop_columns, how="any")

# Measure how much of the dataset survives, relative to the row count `n_rows`.
remain_rows: int = df_case1.shape[0]
remain_ratio: float = remain_rows / n_rows

print(f"remain_rows: {remain_rows}", f"remain_ratio: {remain_ratio:.3f}", sep="\n")

remain_rows: 71626
remain_ratio: 0.398


If we apply this approach, more than 60% of the dataset will be removed ==> That is really bad!

In [54]:
# Case 2: discard only the rows whose Department is NULL, because that column
# has the smallest NULL ratio among the columns of interest.
drop_columns = ["Department"]
df_case2: pd.DataFrame = df.dropna(subset=drop_columns, how="any")

# Measure how much of the dataset survives, relative to the row count `n_rows`.
remain_rows: int = df_case2.shape[0]
remain_ratio: float = remain_rows / n_rows

print(f"remain_rows: {remain_rows}", f"remain_ratio: {remain_ratio:.3f}", sep="\n")

remain_rows: 169465
remain_ratio: 0.941


This approach reduces the number of dropped rows, but it still removes ~6% of the data. From my point of view, that is not a good approach either! We need to preserve data integrity as well as data quality.

2. Drop columns

I do not choose this approach because I want to preserve the data integrity and the schema.

3. Impute missing values

The `Approval Date` column contains only a few NULL values (~0.01%), so I just leave it as is.

I noticed a mismatch between the NULL ratios of `Start Date` and `End Date` (intuitively, they should be the same).

In [23]:
# Select the rows where exactly one of "Start Date" / "End Date" is missing,
# i.e. the rows responsible for the different NULL ratios of the two columns.
# DuckDB resolves the table name `df` to the pandas DataFrame of the same name.
sql = """
SELECT
    *
FROM
    df
WHERE
    (
        "Start Date" IS NOT NULL
        AND "End Date" IS NULL
    )
    OR (
        "Start Date" IS NULL
        AND "End Date" IS NOT NULL
    )
"""

# Materialise the COMPLETE result set as a DataFrame.
# The previous `fetch_df_chunk(vectors_per_chunk=100)` fetches only a single,
# size-bounded chunk of the result, so it would silently truncate the output
# (and the row count / ratio computed from it) on a larger dataset.
df_case3: pd.DataFrame = duckdb.query(query=sql).df()

In [36]:
# Preview the first five rows that have only one of the two dates filled in.
df_case3.head(n=5)

Unnamed: 0,Purchase Order Description,Purchase Order (Contract) Number,Revision Number,Specification Number,Contract Type,Start Date,End Date,Approval Date,Department,Vendor Name,Vendor ID,Address 1,Address 2,City,State,Zip,Award Amount,Procurement Type,Contract PDF
0,DFSS-CTA-HL-OANDE:,279541,0,1263338,DELEGATE AGENCY,,03/31/2025,08/08/2024,DEPARTMENT OF FAMILY AND SUPPORT SERVICES,THRESHOLDS,93347425L,4101 N RAVENSWOOD AVENUE EFT,,CHICAGO,IL,60613,742447.0,,
1,"PIPE, FITTINGS, VALVES AND ACCESSORIES",9929,14,2312,COMMODITIES,11/01/2005,,07/21/2014,DEPARTMENT OF WATER MANAGEMENT,JOHNSON PIPE & SUPPLY COMPANY INC,15975702V,999 W 37TH STREET EFT ST,,CHICAGO,IL,60609,0.0,BID,
2,"PIPE, FITTINGS, VALVES AND ACCESSORIES",9929,12,2312,COMMODITIES,11/01/2005,,07/15/2014,DEPARTMENT OF WATER MANAGEMENT,JOHNSON PIPE & SUPPLY COMPANY INC,15975702V,999 W 37TH STREET EFT ST,,CHICAGO,IL,60609,0.0,BID,
3,DFSS-CTA-HL-OANDE:,279541,1,1263338,DELEGATE AGENCY,,03/31/2025,08/16/2024,DEPARTMENT OF FAMILY AND SUPPORT SERVICES,THRESHOLDS,93347425L,4101 N RAVENSWOOD AVENUE EFT,,CHICAGO,IL,60613,742447.0,,
4,FENCING SERVICES,15087,7,22200,CONSTRUCTION-GENERAL,08/01/2007,,06/25/2014,DEPT OF GENERAL SERVICES,"FENCE MASTERS, INC",93285910V,20400 S COTTAGE GROVE EFT,,CHICAGO HEIGHTS,IL,60411,0.0,BID,


In [27]:
# Size of the one-sided-date subset, both absolute and relative to `n_rows`.
rows: int = df_case3.shape[0]
ratio: float = rows / n_rows

print(f"rows: {rows}", f"ratio: {ratio:.3f}", sep="\n")

rows: 2364
ratio: 0.013


We found that only ~1% of the dataset falls into this case. Therefore, I assume these rows are genuine data and just leave them as is.

In [35]:
# Frequency of every category in "Contract Type"; NaN is counted as its own bucket.
df.loc[:, "Contract Type"].value_counts(dropna=False)

Contract Type
DELEGATE AGENCY                        63546
NaN                                    44747
COMPTROLLER-OTHER                      14259
CONSTRUCTION-LARGE $3MILLIONorABOVE     8584
CONSTRUCTION-AVIATION                   6986
ARCH/ENGINEERING                        4702
WORK SERVICES / FACILITIES MAINT.       4315
COMMODITIES                             4126
PRO SERV CONSULTING $250,000orABOVE     3608
CONSTRUCTION-GENERAL                    3456
Modification                            2230
Time Extension                          2066
PROPERTY LEASE                          1964
CONSTRUCTION                            1791
VEHICLES/HEAVY EQUIPMENT (CAPITAL)      1360
CONVERTED                               1207
Delegate Agency                         1151
Term Agreement                          1050
COMMODITIES-SMALL ORDERS                1046
WORK SERV-AVIATION                       874
PRO SERV-AVIATION                        790
DEMOLITION-SMALL ORDERS                  

The statistics above show the frequency of each value in the `Contract Type` column. It contains a few weird values, but we ignore that and assume they are correct.

NULL is the second most frequent value (44747 rows), which makes it hard to justify imputing it with another existing category. So in this case, we decide to fill NULL values with "Unknown" to preserve the descriptive character of that column.

In [39]:
# Frequency of every category in "Department"; NaN is counted as its own bucket.
department_counts: pd.Series = df.loc[:, "Department"].value_counts(dropna=False)

In [52]:
# Display the per-department row counts (computed with dropna=False, so NaN is included).
department_counts

Department
DEPARTMENT OF FAMILY AND SUPPORT SERVICES    27794
CHICAGO DEPARTMENT OF TRANSPORTATION         21002
DEPARTMENT OF HUMAN RESOURCES                18008
CHICAGO DEPARTMENT OF PUBLIC HEALTH          13641
DEPARTMENT OF BUILDINGS                      11238
                                             ...  
BOARD OF ELECTION COMMISSIONERS                  1
GENERAL ACCOUNTING                               1
DEPT OF ECONOMIC DEVELOPMENT                     1
DEPT OF SENIOR SERVICES                          1
DEPARTMENT OF ADMINISTRATIVE HEARINGS            1
Name: count, Length: 73, dtype: int64

In [45]:
# Number of entries in the Department frequency table.
# NOTE(review): the table was built with dropna=False, so the NaN bucket is
# presumably counted as one of these entries — confirm if a pure category count is needed.
n_values: int = department_counts.size
print(f"n_values: {n_values}")

n_values: 73


In [43]:
# Keep only the departments that occur exactly once in the dataset.
department_counts.loc[department_counts.eq(1)]

Department
LIC COMM & LOCAL LIQ CTRL COMM           1
LICENSE APPEAL COMMISSION                1
BOARD OF ELECTION COMMISSIONERS          1
GENERAL ACCOUNTING                       1
DEPT OF ECONOMIC DEVELOPMENT             1
DEPT OF SENIOR SERVICES                  1
DEPARTMENT OF ADMINISTRATIVE HEARINGS    1
Name: count, dtype: int64

In [50]:
# Look up the NaN bucket of the frequency table: rows without a Department.
department_counts.loc[np.nan]

np.int64(10547)

The analysis above gives us a quick view of the `Department` column: it is highly varied and contains many rare values, such as 7 values that appear only once.

Therefore, we decide to impute the NULL values in the `Department` column with "Unknown".

#### Convert date fields (start_date, end_date, approval_date) to standard date formats.

In [216]:
# RE pattern to detect the valid date with format (MM/DD/YYYY)
re_pattern: str = "^(0[1-9]|1[0-2])/([0][1-9]|[12][0-9]|3[01])/(1|2)\\d{3}$"
target_col: str

sql = """
SELECT 
  "{target_col}",
  CASE
    WHEN
      "{target_col}" IS NULL
      THEN
        "{target_col}"
    WHEN 
      REGEXP_FULL_MATCH("{target_col}", '{re_pattern}') 
      THEN 
        "{target_col}"
    ELSE 
      'bad' 
  END AS detected_case1
FROM 
  df
WHERE 
  detected_case1 = 'bad'
;
"""

In [196]:
# Fill the `sql` template for "Start Date" and list the values flagged as 'bad'.
target_col = "Start Date"
duckdb.sql(query=sql.format(re_pattern=re_pattern, target_col=target_col)).fetchdf()

Unnamed: 0,Start Date,detected_case1
0,01/01/0204,bad
1,07/01/0201,bad
2,07/18/0207,bad
3,01/01/0202,bad
4,03/01/0201,bad
5,10/09/0201,bad


In [197]:
# Fill the `sql` template for "End Date" and list the values flagged as 'bad'.
target_col = "End Date"
duckdb.sql(query=sql.format(re_pattern=re_pattern, target_col=target_col)).fetchdf()

Unnamed: 0,End Date,detected_case1
0,11/30/3030,bad


In [198]:
# Fill the `sql` template for "Approval Date" and list the values flagged as 'bad'.
target_col = "Approval Date"
duckdb.sql(query=sql.format(re_pattern=re_pattern, target_col=target_col)).fetchdf()

Unnamed: 0,Approval Date,detected_case1


- `Start Date`: has several errors in the year field (e.g. `0204`, `0201`).
- `End Date`: includes a weird date, `11/30/3030`. I guess it is a [high date](https://stackoverflow.com/questions/4367050/how-do-you-ignore-records-with-a-date-of-12-31-2999-in-mysql). However, pandas `datetime64[ns]` can only represent dates up to `2262-04-11`, so to keep processing simple I decide to replace `11/30/3030` with `12/31/2261`, which is within the acceptable range of `datetime64[ns]`.
- `Approval Date`: does not include any errors.