In [9]:
import requests
from bs4 import BeautifulSoup
import os

In [10]:
# Scraper configuration.
BASE_URL = 'https://law.justia.com'  # site root that scraped hrefs are appended to
OUTFILE = 'penal_codes.csv'          # CSV file the scraper appends rows to
OUTSIZE = 1024 ** 3                  # 1 GiB cap on the amount of text written per run
# CSV header row.
# NOTE(review): the space after the comma is part of the header, so pandas
# names the second column ' citation' (leading space). The embedding cell
# selects df[' citation'], so change both together if this is ever cleaned up.
COLUMN_LABELS = 'id, citation\n'

In [11]:
def get_soup_from_url(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the request (or ``raise_for_status``) raised a
        ``requests.exceptions.RequestException``. In that case the error is
        printed rather than propagated.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as err:
        print(f"Request failed: {err}")
        page = None
    return page

In [12]:
def get_code_contents_from_url(url):
    """Fetch a page and return the text of its paragraphs as one lower-cased line.

    The text of every ``<p>`` element on the page is whitespace-normalised,
    lower-cased, and the non-empty results are joined with single spaces.

    Args:
        url: Address of the page, fetched via ``get_soup_from_url``.

    Returns:
        The combined paragraph text, or ``""`` when the page could not be
        fetched or yields no paragraph text.
    """
    soup = get_soup_from_url(url)
    if not soup:
        return ""
    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, CR/LF, non-breaking spaces), so re-joining collapses every run to
    # a single space. A single replace("  ", " ") pass only halves runs of
    # spaces and leaves tabs untouched.
    paragraphs = (" ".join(p.get_text().split()).lower() for p in soup.select('p'))
    # Drop empty paragraphs so they do not introduce double spaces in the result.
    return " ".join(text for text in paragraphs if text)

In [13]:
def get_links_from_url(url):
    """Return the anchor tags inside the first ``.codes-listing`` element of a page.

    Args:
        url: Address of the listing page, fetched via ``get_soup_from_url``.

    Returns:
        The ``<a>`` tags found inside the first element matching
        ``.codes-listing`` (possibly none). An empty list is returned when the
        fetch failed, and also when the page has no such element, in which
        case 'Found base page' is printed.
    """
    soup = get_soup_from_url(url)
    if not soup:
        return []
    listing = soup.select_one('.codes-listing')
    if listing is None:
        print('Found base page')
        return []
    # Query the already-parsed element directly instead of serialising it to
    # a string and parsing that string a second time.
    links = listing.select('a')
    print(f'Found {len(links)} links')
    return links

In [14]:
def write_column_labels_once(outfile):
    """Create ``outfile`` containing only the CSV header row if it is missing.

    Nothing is written when the path already exists, whatever it contains.

    Args:
        outfile: Path of the CSV file to initialise with ``COLUMN_LABELS``.
    """
    if os.path.exists(outfile):
        return
    with open(outfile, 'w') as handle:
        handle.write(COLUMN_LABELS)

In [15]:
def append_data_to_csv(data, outfile, encoding='utf-8'):
    """Append ``data`` verbatim to the end of ``outfile``.

    The file is created if it does not exist. Text is written as UTF-8 by
    default instead of the platform's locale encoding, so scraped text with
    non-ASCII characters (for example the section sign) is stored the same
    way on every platform.

    Args:
        data: String to append; no delimiter or newline is added.
        outfile: Path of the file to append to.
        encoding: Text encoding used for writing. Defaults to ``'utf-8'``.
    """
    with open(outfile, 'a', encoding=encoding) as csvfile:
        csvfile.write(data)

In [16]:
def _listing_urls(page_url):
    """Lazily yield ``(index, absolute_url)`` for each listing link on ``page_url``.

    ``index`` is the link's position in the list returned by
    ``get_links_from_url``. Links without an ``href`` are skipped but keep
    their position, so the indices of the remaining links do not shift.
    """
    from urllib.parse import urljoin  # stdlib; imported here so this block is self-contained

    for index, link in enumerate(get_links_from_url(page_url)):
        href = link.get("href")
        if href:
            # urljoin resolves site-relative hrefs against BASE_URL and leaves
            # absolute hrefs intact, unlike plain string concatenation.
            yield index, urljoin(BASE_URL, href)


def _scrape_citations(start_url, outfile, max_bytes):
    """Walk titles -> articles -> citations, appending one CSV row per citation.

    Returns the number of UTF-8 bytes of row text appended. The walk stops,
    without issuing further requests, as soon as that count reaches
    ``max_bytes``.
    """
    if max_bytes <= 0:
        return 0
    written = 0
    for i, title_url in _listing_urls(start_url):
        for j, article_url in _listing_urls(title_url):
            for k, citation_url in _listing_urls(article_url):
                # Commas are removed so the text cannot split the two-column row.
                code_content = get_code_contents_from_url(citation_url).replace(",", "")
                if not code_content:
                    # An empty second field is read back by pandas as NaN, so
                    # rows with no text (e.g. a failed fetch) are not written.
                    print(f'Skipping empty citation: {citation_url}')
                    continue
                row = f'Title {i} Article {j} Citation {k},{code_content}\n'
                append_data_to_csv(row, outfile)
                # Count encoded bytes, since the limit is expressed in bytes.
                written += len(row.encode('utf-8'))
                if written >= max_bytes:
                    # Returning leaves all three loops at once, so no further
                    # listing or citation pages are requested after the cap.
                    return written
    return written


def get_justia_penal_codes(start_path='/codes/new-york/2018/pen/part-3/'):
    """Scrape the citation pages under a Justia code listing into ``OUTFILE``.

    Starting from ``BASE_URL + start_path``, the listing links are followed
    three levels deep (titles, articles, citations). For every citation page
    one row ``Title i Article j Citation k,<text>`` is appended to
    ``OUTFILE``, where i/j/k are the zero-based positions of the links in
    their listings. Scraping stops once about ``OUTSIZE`` bytes of row text
    have been written. 'Done' is printed at the end.

    Rows are appended and the header is only written when the file does not
    exist, so running this again without removing ``OUTFILE`` adds the same
    rows a second time.

    Args:
        start_path: Site-relative path of the top-level listing page.
            Defaults to the path that was previously hard-coded.
    """
    write_column_labels_once(OUTFILE)
    _scrape_citations(f'{BASE_URL}{start_path}', OUTFILE, OUTSIZE)
    print('Done')


In [17]:
# Run the scrape. This issues live HTTP requests and appends rows to OUTFILE.
get_justia_penal_codes()

Found 10 links
Found 4 links
Found 7 links
Found 10 links
Found 3 links
Found 7 links
Found 5 links
Found 30 links
Found 4 links
Found 14 links
Found 28 links
Found 18 links
Found 3 links
Found 10 links
Found 21 links
Found 6 links
Found 5 links
Found 12 links
Found 12 links
Found 11 links
Found 4 links
Found 28 links
Found 10 links
Found 19 links
Found 10 links
Found 15 links
Found 7 links
Found 7 links
Found 5 links
Found 20 links
Found 4 links
Found 7 links
Found 35 links
Found 5 links
Found 14 links
Found 19 links
Found 15 links
Found 11 links
Found 35 links
Found 5 links
Found 31 links
Found 12 links
Found 19 links
Found 25 links
Found 11 links
Found 5 links
Found 34 links
Found 2 links
Found 4 links
Found 7 links
Found 14 links
Found 3 links
Found 10 links
Found 16 links
Found 9 links
Found 3 links
Found 29 links
Found 8 links
Found 13 links
Done


#Original

In [20]:
import pandas as pd

# Load the scraped CSV file (same file name as the OUTFILE constant used by the scraper)
df = pd.read_csv('penal_codes.csv')

In [21]:
# Preview the first rows of the loaded data as plain text.
print(df.head())

                             id  \
0  Title 0 Article 0 Citation 0   
1  Title 0 Article 0 Citation 1   
2  Title 0 Article 0 Citation 2   
3  Title 0 Article 0 Citation 3   
4  Title 0 Article 0 Citation 4   

                                            citation  
0  a person is guilty of criminal solicitation in...  
1  a person is guilty of criminal solicitation in...  
2  a person is guilty of criminal solicitation in...  
3  a person is guilty of criminal solicitation in...  
4  a person is guilty of criminal solicitation in...  


In [23]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Load the CSV file into a DataFrame.
# The header row written by the scraper is 'id, citation', so the second
# column is named ' citation' (with a leading space); that exact name is used
# to select it further down.
df = pd.read_csv('penal_codes.csv')

# Load the pre-trained tokenizer and model from the same checkpoint
# (you may need to adjust the model name as required)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a given text
def get_bert_embedding(text):
    """Return an embedding for ``text`` by mean-pooling the model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: String to embed. Input longer than 512 tokens is truncated by
            the tokenizer.

    Returns:
        A NumPy array holding the mean of ``last_hidden_state`` over
        dimension 1, with size-1 dimensions squeezed out.
    """
    # Tokenize and encode the text for BERT
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Put the input tensors on the device the model currently lives on. The
    # notebook moves the model to CUDA when it is available, and feeding CPU
    # tensors to a CUDA model fails with a device-mismatch error.
    inputs = inputs.to(model.device)
    # Get the embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the last hidden state as the sentence embedding.
    # .cpu() is needed before .numpy() when the tensor is on a GPU and is a
    # no-op when it is already on the CPU.
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings

# Move the model to the GPU when PyTorch reports that CUDA is available, to speed up processing
if torch.cuda.is_available():
    model.to('cuda')

# Apply the BERT embedding function to the ' citation' column (the column name
# carries a leading space because of the CSV header).
# If your dataset is large, consider processing in batches or using tqdm to track progress
# NOTE(review): [:512] slices the returned embedding vector, not the token
# sequence (token truncation is already handled by max_length=512 inside
# get_bert_embedding). If the model's hidden size exceeds 512 — 768 is the
# documented size for bert-base-uncased — the trailing dimensions are
# discarded; confirm this is intended.
df['citation_embeddings'] = df[' citation'].apply(lambda x: get_bert_embedding(x)[:512])

# Now df['citation_embeddings'] will contain the embeddings for each citation text


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Display the new column; each entry is the (sliced) NumPy array computed for that row.
df['citation_embeddings']

0      [-0.045585606, -0.1362382, 0.39512724, -0.1538...
1      [-0.22505607, -0.008862062, 0.29037467, -0.255...
2      [-0.11252634, -0.08307119, 0.43694556, -0.2283...
3      [-0.13397354, -0.18946846, 0.39071646, -0.1566...
4      [-0.14979129, -0.07455121, 0.43193123, -0.2217...
                             ...                        
687    [-0.10705753, -0.009622224, 0.23218858, -0.153...
688    [-0.09208092, -0.0058112266, 0.20358907, -0.12...
689    [-0.15642193, 0.15235814, 0.2109673, -0.140971...
690    [-0.16718373, -0.023032503, 0.16628788, -0.114...
691    [-0.067314036, 0.062474437, 0.35778087, -0.096...
Name: citation_embeddings, Length: 692, dtype: object