In [None]:
import pandas as pd
from google.colab import files

# Upload the dataset through Colab's `files.upload()` helper and keep its
# return value in `uploaded`. The CSV itself is read by file name in the
# next cell.
uploaded = files.upload()

In [None]:
# Read the executions CSV (looked up by file name relative to the working
# directory) into `df` and preview the first rows.
csv_name = 'DPIC Execution Database - U.S. Executions (1) (1).csv'
df = pd.read_csv(csv_name)
df.head()

In [None]:
# Keep only the necessary columns: the target ('Execution Volunteer'), the two
# location fields and the execution date.
needed_columns = ['Execution Volunteer', 'Region', 'State', 'Execution Date']
df_filtered = df.loc[:, needed_columns]
df_filtered.head()

In [None]:
# Drop every row whose 'Region' is 'Northeast'. Rows with any other value,
# including a missing region, are kept.
not_northeast = df_filtered['Region'].ne('Northeast')
df_filtered = df_filtered.loc[not_northeast]
df_filtered.head()

In [None]:
# Fold the 'West' region into 'Midwest'.
#
# `df_filtered` was produced by filtering another frame, so writing a column
# into it with `df_filtered['Region'] = ...` can raise SettingWithCopyWarning
# on pandas versions without copy-on-write. `assign` returns a fresh frame
# with the replaced column instead, which is unambiguous and safe to re-run.
df_filtered = df_filtered.assign(
    Region=df_filtered['Region'].replace(['West'], 'Midwest')
)

print(df_filtered)


In [None]:
# Binary-encode the 'Execution Volunteer' target column: 'yes' -> 1, 'no' -> 0.
volunteer_codes = {'yes': 1, 'no': 0}
volunteer_raw = df_filtered['Execution Volunteer']

# Encode only while the column is still non-numeric. Running the mapping a
# second time would look up 1/0 in `volunteer_codes`, find nothing, and turn
# the whole target into NaN.
if not pd.api.types.is_numeric_dtype(volunteer_raw):
    # Normalise case and surrounding whitespace so variants such as 'Yes' or
    # ' no ' are encoded too. Anything that is still not 'yes'/'no' after
    # normalisation (including missing values) becomes NaN, as before.
    volunteer_encoded = (
        volunteer_raw.astype(str).str.strip().str.lower().map(volunteer_codes)
    )
    # `assign` builds a new frame rather than writing into a filtered slice,
    # which avoids SettingWithCopyWarning.
    df_filtered = df_filtered.assign(**{'Execution Volunteer': volunteer_encoded})

print(df_filtered.head())

In [None]:
# One-hot encode 'Region' into 'Region_<value>' indicator columns.
region_first = ['Region_Midwest', 'Region_South']
region_dummies = pd.get_dummies(df_filtered, columns=['Region'], prefix='Region')

# Reorder so the two expected region indicators lead, followed by every other
# column in its existing order. No column is removed here; a missing
# 'Region_Midwest' or 'Region_South' raises a KeyError.
other_columns = [name for name in region_dummies.columns if name not in region_first]
df_encoded = region_dummies[region_first + other_columns]

# Preview the encoded frame.
print(df_encoded.head())


In [None]:
# Drop surplus region indicator columns, keeping only 'Region_Midwest' and
# 'Region_South'.
#
# This used to be `df_encoded.iloc[:, :-1]` (drop whatever column is last).
# That removes one more column every time the cell is re-run, and if there is
# no extra region dummy it silently drops 'Execution Date', which later cells
# need. Selecting the columns by name makes the cell idempotent and
# independent of column order.
expected_region_cols = ['Region_Midwest', 'Region_South']
extra_region_cols = [
    col for col in df_encoded.columns
    if str(col).startswith('Region_') and col not in expected_region_cols
]
df_encoded = df_encoded.drop(columns=extra_region_cols)

print(df_encoded.head())

In [None]:
# Cast the two region indicator columns from True/False to 1/0. All other
# columns keep their dtype.
region_flags = ['Region_Midwest', 'Region_South']
df_encoded = df_encoded.astype({flag: int for flag in region_flags})

# Preview to confirm the conversion.
print(df_encoded.head())


In [None]:
# Downloading the current dataframe
#df_encoded.to_csv('df_encoded.csv', index=False)
#files.download('df_encoded.csv')

In [None]:
# Target-encode 'State': each state is represented by the mean of
# 'Execution Volunteer' over that state's rows.
# NOTE(review): the means are taken over every row of df_encoded, i.e. before
# the train/test split done later in the notebook, so test rows contribute to
# the encoding. Confirm this is acceptable.
state_target_mean = df_encoded.groupby('State')['Execution Volunteer'].mean()

# Reference table with one row per state: 'State' and its 'State_Encoded' value.
state_mapping = state_target_mean.reset_index(name='State_Encoded')

# Attach the per-state mean to every row as a new 'State_Encoded' column.
encoded_states = df_encoded['State'].map(state_target_mean)
df_encoded['State_Encoded'] = encoded_states

# Show the encoded frame and the reference table.
print(df_encoded.head())
print(state_mapping)


In [None]:
# Downloading the current dataframe
#df_encoded.to_csv('df_encoded.csv', index=False)
#files.download('df_encoded.csv')

In [None]:
# Parse 'Execution Date' as month/day/two-digit-year ('%m/%d/%y').
# errors='coerce' turns any value that does not match the format into NaT
# (Not a Time) instead of raising.
parsed_dates = pd.to_datetime(
    df_encoded['Execution Date'],
    format='%m/%d/%y',
    errors='coerce',
)
df_encoded['Execution Date'] = parsed_dates
print(df_encoded.head())

In [None]:
# Cut-off date separating the "pre" and "post" periods.
legal_change_date = pd.to_datetime('1994-09-13', format='%Y-%m-%d')

# 'Pre_or_Post_Legal_Change' is 1 when the execution date is on or after the
# cut-off and 0 otherwise. Dates that were coerced to NaT compare as False,
# so they also receive 0.
on_or_after_change = df_encoded['Execution Date'].ge(legal_change_date)
df_encoded['Pre_or_Post_Legal_Change'] = on_or_after_change.astype(int)

# Show the date next to its flag.
print("\nFinal DataFrame with Pre/Post Flag:")
print(df_encoded[['Execution Date', 'Pre_or_Post_Legal_Change']])

In [None]:
# Downloading the current dataframe
# df_encoded.to_csv('df_encoded.csv', index=False)
# files.download('df_encoded.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Features and target
feature_columns = ['Region_Midwest', 'Region_South', 'State_Encoded', 'Pre_or_Post_Legal_Change']
X = df_encoded[feature_columns]
y = df_encoded['Execution Volunteer']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The notebook treats the target as imbalanced, so stratify on y: this keeps
# the volunteer / non-volunteer ratio the same in both splits instead of
# leaving the minority-class share of the test set to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shapes of the resulting datasets
print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Testing target shape: {y_test.shape}")

In [None]:
# List the columns currently present in the encoded frame.
current_columns = df_encoded.columns
print("Column Names in Dataset:\n", current_columns)

In [None]:
# Apply SMOTE (Synthetic Minority Oversampling Technique) since the data is imbalanced

from imblearn.over_sampling import SMOTE

# Oversample the training split only; X_test / y_test are not touched here.
smote = SMOTE(random_state=42)
resampled_split = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_split

# Report the sizes after resampling.
print(f"Resampled Training features shape: {X_train_resampled.shape}")
print(f"Resampled Training target shape: {y_train_resampled.shape}")

In [None]:
# Trying Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build and fit the classifier in one step (`fit` returns the estimator).
# NOTE(review): the model is trained on X_train / y_train, not on the
# SMOTE-resampled arrays; class imbalance is handled here through
# class_weight='balanced'. Confirm this is intended.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Predict on the held-out test split.
y_pred_rf = rf_model.predict(X_test)

# Compute each metric once, then print every heading followed by its value.
evaluation = [
    ("Confusion Matrix (Random Forest):", confusion_matrix(y_test, y_pred_rf)),
    ("\nClassification Report (Random Forest):", classification_report(y_test, y_pred_rf)),
    ("\nAccuracy Score (Random Forest):", accuracy_score(y_test, y_pred_rf)),
]
for heading, result in evaluation:
    print(heading)
    print(result)