##### Import Libraries

In [9]:
import pandas as pd

##### Read datasets

In [10]:
# Load the release and receive CSV exports into two separate DataFrames
# (releases first, then receives).
CJ_releases_df, CJ_receives_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/MachineLearning/CJ_Releases_2022.csv',
        'C:/MachineLearning/CJ_Receives_2022.csv',
    )
)

### Phase I

In [11]:
# Give both frames the same 'Date' / 'Type' column names and tag every row
# with the dataset it came from via a 'Record Type' column.
release_columns = {'Release Date': 'Date', 'Release Type': 'Type'}
receive_columns = {'Receive Date': 'Date', 'Receive Type': 'Type'}

for frame, column_map, record_type in (
    (CJ_releases_df, release_columns, 'Release'),
    (CJ_receives_df, receive_columns, 'Receive'),
):
    # Rename in place so the existing DataFrame objects are the ones updated.
    frame.rename(columns=column_map, inplace=True)
    frame['Record Type'] = record_type


##### Dataset Integration

In [12]:

# Stack the release and receive records into one frame; ignore_index=True
# discards the original row labels and renumbers the combined rows.
CriminalJustice_df = pd.concat(
    objs=[CJ_releases_df, CJ_receives_df],
    ignore_index=True,
)

# Persist the combined dataset without writing the index as a column.
file_path = 'C:\\CJ_ML\\CriminalJustice_df.csv'
CriminalJustice_df.to_csv(path_or_buf=file_path, index=False)

In [13]:
# Report how many distinct values each column holds.
unique_counts = CriminalJustice_df.nunique()
print(unique_counts)

Date                    326
Type                     23
Inmate Type              42
Gender                    2
Race                      6
Age                      72
County                  251
Offense Code            716
Offense                   4
Offense Description     710
Sentence Date          5623
Offense Date           7783
Sentence (Years)         40
Record Type               2
dtype: int64


##### Handling missing values

In [14]:
# Report how many missing entries each column holds.
null_counts = CriminalJustice_df.isna().sum()
print(null_counts)

Date                      0
Type                      0
Inmate Type               0
Gender                    0
Race                      0
Age                       0
County                    0
Offense Code              0
Offense                   0
Offense Description     276
Sentence Date          8085
Offense Date              4
Sentence (Years)          0
Record Type               0
dtype: int64


In [15]:
# Fill missing 'Offense Description' values with the most common description
# recorded for the same 'Offense Code'.
#
# The lookup must hold exactly one description per code. Aggregating with a
# bare `pd.Series.mode` yields an array for any code whose descriptions tie
# for most frequent, and mapping such a lookup would write array objects into
# the column. Taking `.iloc[0]` of the mode result keeps a single value per
# code, so one pass is sufficient.
missing_description = CriminalJustice_df['Offense Description'].isnull()

offense_code_to_description = (
    CriminalJustice_df.loc[~missing_description]
    .groupby('Offense Code')['Offense Description']
    .agg(lambda descriptions: descriptions.mode().iloc[0])
    .to_dict()
)

CriminalJustice_df.loc[missing_description, 'Offense Description'] = (
    CriminalJustice_df.loc[missing_description, 'Offense Code'].map(offense_code_to_description)
)

# Codes that never appear with a description are absent from the lookup, so
# their rows are still null after the mapping; label them explicitly.
CriminalJustice_df['Offense Description'] = CriminalJustice_df['Offense Description'].fillna('Unknown')

In [18]:
# Replace missing 'Sentence Date' and 'Offense Date' entries with the
# placeholder string 'Unknown'.
# NOTE(review): this leaves text mixed in with the date values, so these
# columns would need the placeholder handled before any date parsing.
for date_column in ('Sentence Date', 'Offense Date'):
    CriminalJustice_df[date_column] = CriminalJustice_df[date_column].fillna('Unknown')

# Save the frame with the filled-in values, omitting the index column.
CriminalJustice_df.to_csv('C:\\CJ_ML\\CriminalJustice_df_filledDates.csv', index=False)

# Re-check the per-column null counts after filling.
print(CriminalJustice_df.isna().sum())

Date                   0
Type                   0
Inmate Type            0
Gender                 0
Race                   0
Age                    0
County                 0
Offense Code           0
Offense                0
Offense Description    0
Sentence Date          0
Offense Date           0
Sentence (Years)       0
Record Type            0
dtype: int64
