In [16]:
import pandas as pd
import re
import openai

In [17]:
# Load the scraped cases; point the path below at your own CSV if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [18]:
def extract_title_enhanced(case_text):
    """Extract a case title from the raw text of a decision.

    Three regular expressions are tried in order; the first one that matches
    determines the result.

    Args:
        case_text: Raw text of the case. Any value is accepted because it is
            converted with ``str()`` first (so NaN becomes ``'nan'`` and simply
            fails to match).

    Returns:
        str: One of
            * the full matched header, from the opening ``[`` through the
              first ``J.:`` / ``PER CURIAM:`` (first pattern);
            * ``"[<docket>] <text after the bracket>"`` (second pattern);
            * ``"[<docket number>] <party caption>"`` (third pattern);
            * ``'No Title Found'`` when nothing matches.
    """
    case_text = str(case_text)

    # First attempt: "[ G.R. No(s). ... ] ... J.:" or "... PER CURIAM:"; the whole match is the title.
    header_pattern = r'\[\s*(G\.\s*R\.\s*Nos?\.).*?\]\s*.*?(J\.\:|PER\s*CURIAM\:)'
    match = re.search(header_pattern, case_text, re.DOTALL)
    if match:
        return match.group(0).strip()

    # Second attempt: docket given as "G.R. No. <n>" or as a bare number, ending at "J.:".
    # group(1) is the docket, group(4) is the text between "]" and "J.:".
    refined_pattern = (
        r'\[\s*((G\.\s*R\.\s*No\.?\s*L?-?\d+\.?)|(\d+[-\d]*))\.\s*.*?\]\s*(.*?)(?=\.?\s*J\.\:)'
    )
    match = re.search(refined_pattern, case_text, re.DOTALL)
    if match:
        return "[{}] {}".format(match.group(1), match.group(4).strip())

    # Third attempt: also accepts "A.M. No." dockets and an explicit
    # DECISION / RESOLUTION heading before the ponente.
    # Groups: (1) docket prefix, (2) docket number, (3) party caption,
    # (4) text between the heading and "J.:" / "PER CURIAM:".
    extended_pattern = (
        r'\[\s*(?:(G\.\s*R\.\s*No\.?|A\.M\.\s*No\.)\s*(L?-?\d+[-\d]*))\s*(?:.*?\.\s*)?\]\s*(THE\s*.*?|\w+\s*\w+\s*VS\.\s*\w+.*?)(?:\.\s*D\s*E\s*C\s*I\s*S\s*I\s*O\s*N\s*|R\s*E\s*S\s*O\s*L\s*U\s*T\s*I\s*O\s*N\s*)(.*?)(?=\s*J\.\:|PER\s*CURIAM\:)'
    )
    match = re.search(extended_pattern, case_text, re.DOTALL)
    if match:
        # Use group(3), the party caption. group(4) holds only the text after the
        # DECISION/RESOLUTION heading (the ponente's name), which is not the title.
        return "[{}] {}".format(match.group(2), match.group(3).strip())

    return 'No Title Found'


# Derive a 'case_title' column by running the extractor over every 'content' value.
df['case_title'] = df['content'].map(extract_title_enhanced)


In [19]:
# Summarise how the regex extraction performed; expects the 'case_title'
# column produced by the extraction cell.

# Split the rows once into "title found" / "title missing" rather than
# re-filtering the frame for every metric.
missing_title_mask = df['case_title'] == 'No Title Found'
rows_with_title = df[~missing_title_mask]
rows_without_title = df[missing_title_mask]

# Count identified and non-identified titles
identified_count = len(rows_with_title)
no_title_count = len(rows_without_title)

# Get samples for identified and non-identified titles
identified_samples = rows_with_title['case_title'].head(5).tolist()
no_title_samples = rows_without_title['content'].head(5).tolist()

# Displaying the results
print(f"Identified Titles: {identified_count}")
print(f"No Title Found: {no_title_count}\n")
print("Samples of Identified Titles:")
for sample in identified_samples:
    print(f"- {sample}")

print("\nSamples of Contents Without Identified Titles:")
for sample in no_title_samples:
    # 'content' can be NaN (a float) for rows with no text; str() keeps the
    # slice from raising TypeError. Only the first 300 characters are shown.
    print(f"- {str(sample)[:300]}")

Identified Titles: 811
No Title Found: 102

Samples of Identified Titles:
- [ G.R. No. 6717. October 19, 1911 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. FAUSTINO MESINA, DEFENDANT AND APPELLANT. D E C I S I O N TORRES, J.:
- [ G.R. No. 6923. September 12, 1912 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. VALENTIN BERNABE, DEFENDANT AND APPELLANT. D E C I S I O N TORRES, J.:
- [ G.R. No. 1237. September 30, 1903 ] THE UNITED STATES, COMPLAINANT AND APPELLEE, VS. LEONARDO GUINACARAN ET AL., DEFENDANTS AND APPELLANTS. D E C I S I O N TORRES, J.:
- [ G.R. No. 6705. February 27, 1912 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. FELIPE SALVADOR (ALIAS APONG IPI), DEFENDANT AND APPELLANT. D E C I S I O N MORELAND, J.:
- [ G.R. No. L-12907. May 30, 1960 ] THE PEOPLE OF THE PHILIPPINES, PLAINTIFF AND APPELLEE, VS. MORO AMBAHANG, ET AL., DEFENDANTS. HANDAN AMID, INAMA MORSAN, NICOLAS CARPIO ALIAS COLAS AMBAHANG MORO, DEFENDANTS AND APPELLANTS. D E C I S I O N PER CURIAM:

Samples o

In [20]:
# Give every row that still reads 'No Title Found' a fallback title: the first
# 300 characters of 'content', or 'No Content Available' when content is NaN.
needs_fallback = df['case_title'] == 'No Title Found'

# Build the fallback only for the rows being updated (vectorised slice) instead
# of running a row-wise apply over the whole frame.
fallback_titles = (
    df.loc[needs_fallback, 'content']
    .str[:300]
    .fillna('No Content Available')
)
df.loc[needs_fallback, 'case_title'] = fallback_titles

# After updating, count rows with a non-empty title (this includes the rows
# just filled from content as well as the 'No Content Available' placeholders).
new_identified_count = int(df['case_title'].str.len().gt(0).sum())
new_no_title_count = int(df['case_title'].eq('No Content Available').sum())

# Displaying updated counts
print(f"Total Identified Titles (Including Updated): {new_identified_count}")
print(f"'No Content Available' Titles (where content was NaN): {new_no_title_count}\n")

# Displaying the first few rows to check updates
print("First few updated titles:")
print(df['case_title'].head(5).tolist())

Total Identified Titles (Including Updated): 913
'No Content Available' Titles (where content was NaN): 3

First few updated titles:
['[ G.R. No. 6717. October 19, 1911 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. FAUSTINO MESINA, DEFENDANT AND APPELLANT. D E C I S I O N TORRES, J.:', '[ G.R. No. 6923. September 12, 1912 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. VALENTIN BERNABE, DEFENDANT AND APPELLANT. D E C I S I O N TORRES, J.:', '[ G.R. No. 1237. September 30, 1903 ] THE UNITED STATES, COMPLAINANT AND APPELLEE, VS. LEONARDO GUINACARAN ET AL., DEFENDANTS AND APPELLANTS. D E C I S I O N TORRES, J.:', '[ G.R. No. 6705. February 27, 1912 ] THE UNITED STATES, PLAINTIFF AND APPELLEE, VS. FELIPE SALVADOR (ALIAS APONG IPI), DEFENDANT AND APPELLANT. D E C I S I O N MORELAND, J.:', '[ G.R. No. L-12907. May 30, 1960 ] THE PEOPLE OF THE PHILIPPINES, PLAINTIFF AND APPELLEE, VS. MORO AMBAHANG, ET AL., DEFENDANTS. HANDAN AMID, INAMA MORSAN, NICOLAS CARPIO ALIAS COLAS AMBAHANG MORO, DE

In [24]:
# Display the first few rows of the dataset to understand its structure
print("First few rows of the dataset:")
print(df.head(), "\n")  # Adjust the number inside head() as needed

# Rows whose title is a fallback (content prefix or 'No Content Available') are
# exactly the rows for which the extractor finds no title in 'content'.
# A length test such as `len <= 300` cannot be used here: regex-extracted
# titles are usually shorter than 300 characters too, so it would count them.
fallback_mask = df['content'].apply(extract_title_enhanced) == 'No Title Found'
updated_title_count = int(fallback_mask.sum())
no_content_count = int(df['case_title'].eq('No Content Available').sum())

# Count unique titles, excluding 'No Content Available'
titles_with_content = df.loc[df['case_title'] != 'No Content Available', 'case_title']
unique_titles_count = titles_with_content.nunique(dropna=False)

# Check for any missing values across the entire DataFrame
total_missing_values = int(df.isnull().sum().sum())

# Summary
print(f"Total rows in dataset: {len(df)}")
print(f"Titles Updated with Content or Marked as 'No Content Available': {updated_title_count}")
print(f"Unique Titles (Excluding 'No Content Available'): {unique_titles_count}")
print(f"'No Content Available' Titles: {no_content_count}")
print(f"Total Missing Values Across All Columns: {total_missing_values}")

First few rows of the dataset:
                                               links  \
0  https://elibrary.judiciary.gov.ph/assets/dtSea...   
1  https://elibrary.judiciary.gov.ph/assets/dtSea...   
2  https://elibrary.judiciary.gov.ph/assets/dtSea...   
3  https://elibrary.judiciary.gov.ph/assets/dtSea...   
4  https://elibrary.judiciary.gov.ph/assets/dtSea...   

                                             content  \
0  21 Phil. 615 [ G.R. No. 6717. October 19, 1911...   
1  23 Phil. 154 [ G.R. No. 6923. September 12, 19...   
2  2 Phil. 551 [ G.R. No. 1237. September 30, 190...   
3  22 Phil. 113 [ G.R. No. 6705. February 27, 191...   
4  108 Phil. 325 [ G.R. No. L-12907. May 30, 1960...   

                                          case_title  
0  [ G.R. No. 6717. October 19, 1911 ] THE UNITED...  
1  [ G.R. No. 6923. September 12, 1912 ] THE UNIT...  
2  [ G.R. No. 1237. September 30, 1903 ] THE UNIT...  
3  [ G.R. No. 6705. February 27, 1912 ] THE UNITE...  
4  [ G.R. No. L-1290

In [26]:
# Persist the enriched frame; the row index is written as the first column (pandas default).
df.to_csv("cases_w_meta.csv", index=True)