##### Import Libraries

In [6]:
import pandas as pd

##### Read datasets

In [7]:
# Source CSVs for the 2022 release and receive records
releases_csv_path = 'C:/MachineLearning/CJ_Releases_2022.csv'
receives_csv_path = 'C:/MachineLearning/CJ_Receives_2022.csv'

# Load each file into its own DataFrame
CJ_releases_df = pd.read_csv(releases_csv_path)
CJ_receives_df = pd.read_csv(receives_csv_path)

### Phase I

In [8]:
# Give both frames the same 'Date' / 'Type' column names, then tag every row
# with the file it came from so the two sources stay distinguishable.
CJ_releases_df = (
    CJ_releases_df
    .rename(columns={'Release Date': 'Date', 'Release Type': 'Type'})
    .assign(**{'Record Type': 'Release'})
)
CJ_receives_df = (
    CJ_receives_df
    .rename(columns={'Receive Date': 'Date', 'Receive Type': 'Type'})
    .assign(**{'Record Type': 'Receive'})
)


##### Dataset Integration

In [9]:

# Stack the release rows on top of the receive rows and renumber the index from 0
frames_to_stack = [CJ_releases_df, CJ_receives_df]
CriminalJustice_df = pd.concat(frames_to_stack, ignore_index=True)

# Persist the combined frame (without the index column)
file_path = 'C:\\CJ_ML\\CriminalJustice_df.csv'
CriminalJustice_df.to_csv(file_path, index=False)

In [10]:
# Number of distinct values in each column
unique_counts = CriminalJustice_df.nunique()
print(unique_counts)

Date                    326
Type                     23
Inmate Type              42
Gender                    2
Race                      6
Age                      72
County                  251
Offense Code            716
Offense                   4
Offense Description     710
Sentence Date          5623
Offense Date           7783
Sentence (Years)         40
Record Type               2
dtype: int64


In [16]:
# Collect every row that repeats an earlier row (the first occurrence is not flagged)
duplicates = CriminalJustice_df.loc[CriminalJustice_df.duplicated(keep='first')]
num_duplicates = len(duplicates)
print(f"Number of duplicate rows: {num_duplicates}")

if num_duplicates == 0:
    print("No duplicate rows found.")
else:
    print("Duplicate Rows:")
    print(duplicates)
    duplicate_indices = duplicates.index.tolist()
    print(f"Indices of duplicate rows: {duplicate_indices}")

# Keep only the first occurrence of each row
CriminalJustice_df = CriminalJustice_df.drop_duplicates(keep='first')

# Sanity check: the de-duplicated frame should contain no repeated rows
duplicates_after_deletion = CriminalJustice_df.loc[CriminalJustice_df.duplicated(keep='first')]
num_duplicates_after_deletion = len(duplicates_after_deletion)
print(f"Number of duplicate rows after deletion: {num_duplicates_after_deletion}")

if num_duplicates_after_deletion == 0:
    print("All duplicate rows have been successfully deleted.")
else:
    print("There are still duplicate rows after deletion, which is unexpected.")

Number of duplicate rows: 1
Duplicate Rows:
             Date     Type Inmate Type Gender   Race  Age County  \
84000  08/01/2022  New Rcv          ID      M  Black   19  Falls   

       Offense Code  Offense        Offense Description Sentence Date  \
84000      13150005  Violent  AGG ASSLT W/DEADLY WEAPON    07/14/2022   

      Offense Date Sentence (Years) Record Type  
84000   08/26/2021         10 Years     Receive  
Indices of duplicate rows: [84000]
Number of duplicate rows after deletion: 0
All duplicate rows have been successfully deleted.


##### Handling missing values

In [12]:
# Number of missing values in each column
null_counts = CriminalJustice_df.isna().sum()
print(null_counts)

Date                      0
Type                      0
Inmate Type               0
Gender                    0
Race                      0
Age                       0
County                    0
Offense Code              0
Offense                   0
Offense Description     276
Sentence Date          8085
Offense Date              4
Sentence (Years)          0
Record Type               0
dtype: int64


In [13]:
# Fill missing 'Offense Description' values based on the 'Offense Code'.
#
# For every offense code, look up the most frequent non-null description that
# appears with that code. Series.mode() returns ALL tied values, so .iloc[0]
# is used to keep exactly one scalar description per code. (Aggregating with
# the bare pd.Series.mode can produce a non-scalar result for codes with
# several equally common descriptions, which must not be written into the
# column.)
description_is_known = CriminalJustice_df['Offense Description'].notna()
offense_code_to_description = (
    CriminalJustice_df.loc[description_is_known]
    .groupby('Offense Code')['Offense Description']
    .agg(lambda descriptions: descriptions.mode().iloc[0])
    .to_dict()
)

# Only rows lacking a description are touched. Codes that never appear with a
# description are absent from the mapping, so .map() leaves them as NaN here.
description_is_missing = ~description_is_known
CriminalJustice_df.loc[description_is_missing, 'Offense Description'] = (
    CriminalJustice_df.loc[description_is_missing, 'Offense Code']
    .map(offense_code_to_description)
)

# Anything still missing has no known description for its code
CriminalJustice_df['Offense Description'] = CriminalJustice_df['Offense Description'].fillna('Unknown')

In [14]:
# Replace missing 'Sentence Date' and 'Offense Date' entries with the
# placeholder 'Unknown'
for date_column in ('Sentence Date', 'Offense Date'):
    CriminalJustice_df[date_column] = CriminalJustice_df[date_column].fillna('Unknown')

# Save the frame with the filled-in values (without the index column)
filled_dates_csv_path = 'C:\\CJ_ML\\CriminalJustice_df_filledDates.csv'
CriminalJustice_df.to_csv(filled_dates_csv_path, index=False)

# Confirm how many missing values remain in each column
remaining_null_counts = CriminalJustice_df.isna().sum()
print(remaining_null_counts)

Date                   0
Type                   0
Inmate Type            0
Gender                 0
Race                   0
Age                    0
County                 0
Offense Code           0
Offense                0
Offense Description    0
Sentence Date          0
Offense Date           0
Sentence (Years)       0
Record Type            0
dtype: int64
