In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the JSONL data into a pandas DataFrame
train_data = pd.read_json('Data/train.jsonl', lines=True)
dev_data = pd.read_json('Data/dev.jsonl', lines=True)
test_data = pd.read_json('Data/test.jsonl', lines=True)

print(f"Train data shape: {train_data.shape}")
print(f"Dev data shape: {dev_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Concatenate the datasets into one single DataFrame
full_data = pd.concat([train_data, dev_data, test_data], ignore_index=True)

# Load the embeddings from the .npy file
train_embeddings = np.load('Data/sembed/train.npy')
dev_embeddings = np.load('Data/sembed/dev.npy')
test_embeddings = np.load('Data/sembed/test.npy')

# Concatenate the embeddings into one NumPy array
full_embeddings = np.vstack([train_embeddings, dev_embeddings, test_embeddings])

# Add the concatenated embeddings to the full dataset
full_data['conversation_embeddings'] = list(full_embeddings)

# Display the first few rows of the DataFrame to verify
full_data.head()

Train data shape: (4612, 19)
Dev data shape: (577, 19)
Test data shape: (577, 18)


Unnamed: 0,case_id,title,petitioner,respondent,petitioner_state,respondent_state,petitioner_category,respondent_category,chief_justice,issue_area,year,argument_date,decision_date,court_hearing_length,utterances_number,justices,majority_ratio,court_hearing,successful_appeal,conversation_embeddings
0,case3833,Consolidated Rail Corporation v. Gottshall,Consolidated Rail Corporation,Gottshall,UNKNOWN,UNKNOWN,railroad,"employee, or job applicant, including benefici...",Rehnquist,Economic Activity,1993,2/28/1994,6/24/1994,3349.126,199,"[{'name': 'j__harry_a_blackmun', 'born_year': ...",0.666667,We'll hear argument first this morning in Numb...,1.0,"[-0.018852338, 0.030868832, 0.056592677, 0.077..."
1,case2255,Army & Air Force Exchange Service v. Sheehan,Army & Air Force Exchange Service,Sheehan,UNKNOWN,United States,United States,governmental employee or job applicant,Burger,Judicial Power,1981,2/23/1982,6/1/1982,2658.159,73,"[{'name': 'j__thurgood_marshall', 'born_year':...",1.0,We will hear arguments next in Army and Air Fo...,1.0,"[-0.07904932, -0.009505805, 0.006709932, 0.009..."
2,case5247,RadLAX Gateway Hotel v. Amalgamated Bank,"RadLAX Gateway Hotel, LLC., et al.",Amalgamated Bank,UNKNOWN,UNKNOWN,debtor,"bank, savings and loan, credit union, investme...",Roberts,Civil Rights,2011,4/23/2012,5/29/2012,3510.266,225,"[{'name': 'j__john_g_roberts_jr', 'born_year':...",1.0,We'll hear argument first this morning in Case...,0.0,"[-0.10165222, 0.036206104, -0.048806086, -0.02..."
3,case5518,Williams v. Pennsylvania,Terrance Williams,Pennsylvania,UNKNOWN,Pennsylvania,"prisoner, inmate of penal institution",State,Roberts,Due Process,2015,2/29/2016,6/9/2016,3603.375,276,"[{'name': 'j__john_g_roberts_jr', 'born_year':...",0.625,"We'll hear argument next in Case 15-5040, Will...",1.0,"[-0.0079574855, 0.038914457, 0.0008458691, -0...."
4,case2296,In re R. M. J.,,In Re R. M. J.,UNKNOWN,Missouri,"attorney, or person acting as such;includes ba...",state or U.S. supreme court,Burger,Attorneys,1981,11/9/1981,1/25/1982,3319.019,243,"[{'name': 'j__thurgood_marshall', 'born_year':...",1.0,We will hear arguments next in No. 80-1431.|||...,1.0,"[-0.059029054, 0.03719178, 0.017748807, -0.094..."


In [3]:
# Create a new categorical column 'file_year' by binning the 'year' column into specified ranges
full_data['file_year'] = pd.cut(full_data['year'], bins=[1900, 2000, 2010, 2020], labels=['before_2000', '2000_2010', '2010_2020'])

# Extract the year from 'argument_date' and create a new column 'argument_year'
full_data['argument_year'] = pd.to_datetime(full_data['argument_date']).dt.year

# Calculate the lagged time between 'argument_year' and 'year' and create a new column 'lagged_time'
full_data['lagged_time'] = full_data['argument_year'] - full_data['year']

# Extract the quarter from 'argument_date' and create a new column 'argument_quarter'
full_data['argument_quarter'] = pd.to_datetime(full_data['argument_date']).dt.quarter

In [4]:
# Convert 'argument_date' and 'decision_date' columns to datetime format
full_data['argument_date'] = pd.to_datetime(full_data['argument_date'], format='%m/%d/%Y')
full_data['decision_date'] = pd.to_datetime(full_data['decision_date'], format='%m/%d/%Y')

# Calculate the hearing length in months and create a new column 'hearing_length_months'
full_data['hearing_length_months'] = (full_data['decision_date'].dt.year - full_data['argument_date'].dt.year) * 12 + (full_data['decision_date'].dt.month - full_data['argument_date'].dt.month)

In [5]:
# Normalize numerical features
scaler = StandardScaler()

# Replace zero court_hearing_length values with a small value to avoid division by zero
full_data['court_hearing_length'].replace(0, 1e-5, inplace=True)

full_data['speech_rate'] = full_data['utterances_number'] / full_data['court_hearing_length']

# Add 'lagged_time' to the list of numerical columns to be normalized
numerical_cols = ['court_hearing_length', 'utterances_number', 'speech_rate', 'lagged_time', 'hearing_length_months', 'majority_ratio']

# Normalize numerical columns including 'lagged_time'
full_data[numerical_cols] = scaler.fit_transform(full_data[numerical_cols])


In [6]:
# Create separate encoders for each categorical column
issue_area_encoder = OneHotEncoder(sparse_output=False)
file_date_encoder = OneHotEncoder(sparse_output=False)
argument_quarter_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform each categorical feature separately
issue_area_encoded = issue_area_encoder.fit_transform(full_data[['issue_area']])
file_date_encoded = file_date_encoder.fit_transform(full_data[['file_year']])
argument_quarter_encoded = argument_quarter_encoder.fit_transform(full_data[['argument_quarter']])

# Create DataFrames for each of the encoded features with proper column names
issue_area_df = pd.DataFrame(issue_area_encoded, columns=issue_area_encoder.get_feature_names_out(['issue_area']))
file_date_df = pd.DataFrame(file_date_encoded, columns=file_date_encoder.get_feature_names_out(['file_year']))
argument_quarter_df = pd.DataFrame(argument_quarter_encoded, columns=argument_quarter_encoder.get_feature_names_out(['argument_quarter']))

# Concatenate the encoded features back to the original DataFrame
full_data = pd.concat([full_data, issue_area_df, file_date_df, argument_quarter_df], axis=1)


In [7]:
# Define a function to extract information about the justices
def extract_justice_info(justice_list):
    political_directions = [j['political_direction'] for j in justice_list]
    return pd.Series({
        'liberal_justices': political_directions.count('Liberal'),
        'conservative_justices': political_directions.count('Conservative')
    })

# Apply the function to extract new justice-related features
justice_info = full_data['justices'].apply(extract_justice_info)
# Compute the liberal ratio
justice_info['liberal_ratio'] = justice_info['liberal_justices'] / (justice_info['liberal_justices'] + justice_info['conservative_justices'])

full_data = pd.concat([full_data, justice_info], axis=1)

In [8]:
# Drop the original categorical column if it's no longer needed
full_data.drop(['court_hearing'], axis=1, inplace=True)
full_data.drop(['argument_date'], axis=1, inplace=True)
full_data.drop(['argument_year'], axis=1, inplace=True)
full_data.drop(['year'], axis=1, inplace=True)
full_data.drop(['issue_area', 'file_year', 'argument_quarter'], axis=1, inplace=True)

# Drop useless columns
full_data.drop(['case_id', 'title'], axis=1, inplace=True)


In [9]:
full_data.columns

Index(['petitioner', 'respondent', 'petitioner_state', 'respondent_state',
       'petitioner_category', 'respondent_category', 'chief_justice',
       'decision_date', 'court_hearing_length', 'utterances_number',
       'justices', 'majority_ratio', 'successful_appeal',
       'conversation_embeddings', 'lagged_time', 'hearing_length_months',
       'speech_rate', 'issue_area_Attorneys', 'issue_area_Civil Rights',
       'issue_area_Criminal Procedure', 'issue_area_Due Process',
       'issue_area_Economic Activity', 'issue_area_Federal Taxation',
       'issue_area_Federalism', 'issue_area_First Amendment',
       'issue_area_Interstate Relations', 'issue_area_Judicial Power',
       'issue_area_Miscellaneous', 'issue_area_Privacy',
       'issue_area_Private Action', 'issue_area_UNKNOWN', 'issue_area_Unions',
       'file_year_2000_2010', 'file_year_2010_2020', 'file_year_before_2000',
       'argument_quarter_1.0', 'argument_quarter_2.0', 'argument_quarter_3.0',
       'argument_qu

In [10]:
# Define the columns to be processed and included in the final DataFrame
processed_columns = ['petitioner', 'respondent', 'petitioner_state', 'respondent_state', 'petitioner_category', 'respondent_category',
                     'issue_area_Attorneys', 'issue_area_Civil Rights', 'issue_area_Criminal Procedure', 'issue_area_Due Process', 'issue_area_Economic Activity', 'issue_area_Federal Taxation', 'issue_area_Federalism', 'issue_area_First Amendment', 'issue_area_Interstate Relations', 'issue_area_Judicial Power', 'issue_area_Miscellaneous', 'issue_area_Privacy', 'issue_area_Private Action', 'issue_area_UNKNOWN', 'issue_area_Unions',
                     'file_year_2000_2010', 'file_year_2010_2020', 'file_year_before_2000', 'argument_quarter_1.0', 'argument_quarter_2.0', 'argument_quarter_3.0', 'argument_quarter_4.0', 'argument_quarter_nan',
                     'court_hearing_length', 'utterances_number', 'lagged_time', 'speech_rate', 'hearing_length_months',
                     'conversation_embeddings',  # This will be dropped later after extracting individual embeddings
                     'decision_date', 'majority_ratio',
                     'chief_justice', 'justices', 'liberal_justices', 'conservative_justices', 'liberal_ratio', 
                     'successful_appeal']

# Reindex the DataFrame to include only the specified columns
full_data = full_data.reindex(columns=processed_columns)

In [11]:
# Convert the 'conversation_embeddings' column into a DataFrame with individual embedding columns
embedding_columns = pd.DataFrame(full_data['conversation_embeddings'].tolist(), 
                                                                 columns=[f'embedding_{i}' for i in range(1, 385)])

# Concatenate the new embedding columns back to the original DataFrame
full_data = pd.concat([full_data, embedding_columns], axis=1)

# Drop the original 'conversation_embeddings' column as it is no longer needed
full_data.drop(['conversation_embeddings'], axis=1, inplace=True)

# Define the first set of columns to be included in the final DataFrame
columns_1 = ['petitioner', 'respondent', 'petitioner_state', 'respondent_state', 'petitioner_category', 'respondent_category',
                         'issue_area_Attorneys', 'issue_area_Civil Rights', 'issue_area_Criminal Procedure', 'issue_area_Due Process', 'issue_area_Economic Activity', 'issue_area_Federal Taxation', 'issue_area_Federalism', 'issue_area_First Amendment', 'issue_area_Interstate Relations', 'issue_area_Judicial Power', 'issue_area_Miscellaneous', 'issue_area_Privacy', 'issue_area_Private Action', 'issue_area_UNKNOWN', 'issue_area_Unions',
                         'file_year_2000_2010', 'file_year_2010_2020', 'file_year_before_2000', 'argument_quarter_1.0', 'argument_quarter_2.0', 'argument_quarter_3.0', 'argument_quarter_4.0', 'argument_quarter_nan',
                         'court_hearing_length', 'utterances_number', 'lagged_time', 'speech_rate', 'hearing_length_months']

# Define the second set of columns to be included in the final DataFrame
columns_2 = ['decision_date', 'majority_ratio', 'chief_justice', 'justices', 'liberal_justices', 'conservative_justices', 'liberal_ratio', 'successful_appeal']

# Combine the two sets of columns along with the embedding columns
final_columns = columns_1 + [f'embedding_{i}' for i in range(1, 385)] + columns_2

# Reindex the DataFrame to include only the specified columns in the desired order
full_data = full_data.reindex(columns=final_columns)

In [12]:
# Split back into original sets
train_data_final = full_data.iloc[:len(train_data)]
dev_data_final = full_data.iloc[len(train_data):len(train_data) + len(dev_data)]
test_data_final = full_data.iloc[len(train_data) + len(dev_data):]

# Verify the split
print(f"Train data shape: {train_data_final.shape}")
print(f"Dev data shape: {dev_data_final.shape}")
print(f"Test data shape: {test_data_final.shape}")

Train data shape: (4612, 426)
Dev data shape: (577, 426)
Test data shape: (577, 426)


In [13]:
# Drop the 'successful_appeal' column from the test dataset as it is not needed for testing
test_data_final.drop(['successful_appeal'], axis=1, inplace=True)

# Save the processed training data to a CSV file
train_data_final.to_csv('Data/train_processed.csv', index=False)

# Save the processed development data to a CSV file
dev_data_final.to_csv('Data/dev_processed.csv', index=False)

# Save the processed test data to a CSV file
test_data_final.to_csv('Data/test_processed.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_final.drop(['successful_appeal'], axis=1, inplace=True)
