In [5]:
# Import Required Libraries
import category_encoders as ce
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [6]:
# Load datasets: each preprocessed CSV is read into its own DataFrame.
criminal_behavior_data = pd.read_csv('criminal_profiling_data_fixed_preprocessed.csv')
psychological_data = pd.read_csv('psychological_data_preprocessed.csv')
expanded_criminal_data = pd.read_csv('Criminal+expanded.csv')
bpd_victim_based_data = pd.read_csv('BPD_Part_1_Victim_Based_Crime_Data_preprocessed.csv')
# dtype={11: 'str'} requests string parsing for the column keyed 11
# (presumably a positional index for a mixed-type column -- confirm).
household_data = pd.read_csv(
    'NCVS_HOUSEHOLD_PREPROCESSED.csv',
    dtype={11: 'str'},
    low_memory=False,
)

# Display dataset summaries: a heading followed by the first rows of each frame.
dataset_previews = {
    "Criminal Behavior Data:": criminal_behavior_data,
    "Psychological Data:": psychological_data,
    "Expanded Criminal Data:": expanded_criminal_data,
    "BPD Victim Based Data:": bpd_victim_based_data,
    "Household Data:": household_data,
}
for heading, frame in dataset_previews.items():
    print(heading)
    print(frame.head(), '\n')


Criminal Behavior Data:
     Name Date of Birth  Gender Nationality Criminal Record  \
0  Name_0    1993-03-31    Male  Australian      Cybercrime   
1  Name_1    1952-05-10  Female    American        Burglary   
2  Name_2    1964-10-04    Male    Canadian           Fraud   
3  Name_3    1982-10-04    Male     British         Assault   
4  Name_0    1980-11-23   Other     British         Assault   

                   Residence  Crime Type  Crime Date Crime Time   Latitude  \
0  441 Pine St., Springfield       Fraud  2022-06-22   02:11:30 -55.421982   
1       992 Pine St., Sydney    Burglary  2017-01-09   07:18:39 -14.411651   
2       902 Pine St., Sydney     Assault  2015-06-17   12:58:13 -67.287944   
3   209 Oak St., Springfield     Assault  2021-01-28   22:59:03 -67.484036   
4      617 Oak St., New York  Cybercrime  2011-12-28   13:29:30  77.243051   

    Longitude    Evidence Collected     Victim Info  
0 -123.431543           Fingerprint  Female, Age 28  
1 -111.418383       

In [9]:
# Function to preprocess data
def preprocess_data(df, target_column):
    """Impute missing values in ``df`` and split it into features and target.

    Object-dtype columns are filled with their most frequent value and
    int64/float64 columns with their median. If ``target_column`` is not
    present, it is created and filled with 0.

    Side effect: ``df`` itself is modified (imputed values and, when missing,
    the new target column are written back into it).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to clean; modified in place.
    target_column : str
        Name of the column to return as the target.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without ``target_column``.
    y : pandas.Series
        The ``target_column`` values.
    categorical_cols : pandas.Index
        Object-dtype column names of ``df`` as found on entry.
    numerical_cols : pandas.Index
        int64/float64 column names of ``df`` as found on entry.

    Notes
    -----
    The two column listings are computed before the target is separated, so
    they contain ``target_column`` whenever it already existed in ``df`` with
    a matching dtype.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

    # Fill missing values
    for col in categorical_cols:
        modes = df[col].mode()
        # mode() is empty when every value in the column is missing; there is
        # nothing to impute with, so leave the column untouched instead of
        # failing on the [0] lookup.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])  # Fill categorical with mode
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())  # Fill numerical with median

    # Add target column if missing (constant placeholder of 0)
    if target_column not in df.columns:
        df[target_column] = 0

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    return X, y, categorical_cols, numerical_cols

# Preprocess datasets
# Name of the target column used for each dataset.
# NOTE(review): preprocess_data creates an all-zero column when the named
# target is absent -- confirm each of these columns really exists in its CSV.
target_criminal_behavior = 'reoffended'
target_psychological = 'psychological_disorder'
target_expanded_criminal = 'offense_type'
target_bpd_victim = 'victim_severity'
target_household = 'hnumber'

# Impute each dataset and split it into features (X_*), target (y_*) and the
# categorical / numerical column listings.
(
    X_criminal_behavior,
    y_criminal_behavior,
    cat_cols_cb,
    num_cols_cb,
) = preprocess_data(criminal_behavior_data, target_criminal_behavior)

(
    X_psychological,
    y_psychological,
    cat_cols_psych,
    num_cols_psych,
) = preprocess_data(psychological_data, target_psychological)

(
    X_expanded_criminal,
    y_expanded_criminal,
    cat_cols_ec,
    num_cols_ec,
) = preprocess_data(expanded_criminal_data, target_expanded_criminal)

(
    X_bpd_victim,
    y_bpd_victim,
    cat_cols_bpd,
    num_cols_bpd,
) = preprocess_data(bpd_victim_based_data, target_bpd_victim)

(
    X_household,
    y_household,
    cat_cols_household,
    num_cols_household,
) = preprocess_data(household_data, target_household)


In [11]:
# Combine all datasets
# NOTE(review): axis=1 places the frames side by side and aligns rows by index
# label, padding shorter frames with NaN -- confirm that pairing rows of
# different datasets this way is intended.
combined_df = pd.concat([
    X_criminal_behavior, X_psychological, X_expanded_criminal,
    X_bpd_victim, X_household
], axis=1)

# Add target columns back into the combined dataset
combined_targets = {
    'reoffended': y_criminal_behavior,
    'psychological_disorder': y_psychological,
    'offense_type': y_expanded_criminal,
    'victim_severity': y_bpd_victim,
    'hnumber': y_household,
}
for target_name, target_values in combined_targets.items():
    combined_df[target_name] = target_values

# Handle missing values
numerical_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = combined_df.select_dtypes(include=['object']).columns

for col in numerical_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].median())
for col in categorical_cols:
    col_modes = combined_df[col].mode()
    # mode() is empty for an all-missing column; skip it rather than fail on
    # the positional lookup.
    if not col_modes.empty:
        combined_df[col] = combined_df[col].fillna(col_modes.iloc[0])

# Standardize numerical features only. The target columns are excluded:
# scaling them would turn class labels into z-scores, which cannot serve as
# labels for a classifier.
# NOTE(review): the scaler is fit on every row, before any train/test split;
# consider fitting it on the training rows only.
feature_numerical_cols = [col for col in numerical_cols if col not in combined_targets]
scaler = StandardScaler()
if feature_numerical_cols:
    combined_df[feature_numerical_cols] = scaler.fit_transform(combined_df[feature_numerical_cols])


In [13]:
# Prepare data (features and target for 'reoffended')
target_columns = ['reoffended', 'psychological_disorder', 'offense_type', 'victim_severity', 'hnumber']
X_final = combined_df.drop(columns=target_columns)
y_final = combined_df['reoffended']

# Split the data first, so the encoder below never sees the test labels.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42)

# Encode categorical features.
# Only columns still present in the feature frame are encoded: object-typed
# target columns were dropped above and must not be handed to the encoder.
encode_cols = [col for col in categorical_cols if col in X_final.columns]
encoder = ce.TargetEncoder(cols=encode_cols)

# Target statistics are learned from the training split only and then applied
# to the test split; fitting on all rows would leak test labels into features.
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

# Keep X_final as an encoded matrix, using the train-fitted encoder.
X_final = encoder.transform(X_final)


In [15]:
# Identify categorical columns in X_train
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply One-Hot Encoding to categorical columns.
# handle_unknown='ignore' lets the test split contain categories that were
# not seen while fitting on the training split.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training data, then reuse them for the test data
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Convert encoded features to DataFrame (same generated column names for both splits)
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names)

# Drop original categorical columns and concatenate encoded features.
# The index is reset so the remaining columns line up row-by-row with the
# freshly built encoded frames, which carry a default 0..n-1 index.
# NOTE(review): y_train / y_test are not reset in the visible cells, so X and y
# no longer share index labels after this step -- avoid label-aligned joins.
X_train = pd.concat(
    [X_train.drop(columns=categorical_cols).reset_index(drop=True), X_train_encoded_df],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=categorical_cols).reset_index(drop=True), X_test_encoded_df],
    axis=1,
)


