## Imports

In [192]:
# Numerical computing
import numpy as np

# Data manipulation
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning (scikit-learn)
import sklearn

In [193]:
df_documents = pd.read_csv("data/epstein-documents-2026-02-18.csv")
df_emails = pd.read_csv("data/epstein-emails-2026-02-18.csv")
df_flights = pd.read_csv("data/epstein-flights-2026-02-18.csv")
df_persons = pd.read_csv("data/epstein-persons-2026-02-18.csv")

## Data Cleaning and Preparation

### Epstein Documents

In [194]:
df_documents.head()

Unnamed: 0,ID,Title,Category,Source,Date,Pages,Summary,PDF URL,Tags
0,dc-26996597,Juan,deposition,court-unsealed,2026-02-13,4,Juuan |12-21-2s 6 Mouth ra Detentiou Nitaray l...,,
1,dc-26954377,Virginia-Giuffre-interview-Jeffrey-Epstein-2011,case-file,court-unsealed,2026-02-12,1,PREGED PURSUANT TO FS 766.205(4) and/or WORK P...,,
2,dc-26954378,Virginia-Giuffre-interview-Jeffrey-Epstein-201...,case-file,court-unsealed,2026-02-12,1,Edwards adv. Epstein Telephone interview wi ...,,
3,dc-26903588,CBP jeffrey epstein records 1,case-file,doj,2026-02-09,51,EN U.S. Customs and Border Protection 8 U.S. D...,,
4,dc-26903595,CBP jeffrey epstein records 3,case-file,doj,2026-02-09,174,Jeffrey Edward Epstein 0/20/1953 n MREZ TCT...,,


In [195]:
print("Number of rows :", df_documents.shape[0])
print("Number of cols :", df_documents.shape[1])

Number of rows : 50
Number of cols : 9


In [196]:
df_documents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        50 non-null     object
 1   Title     50 non-null     object
 2   Category  50 non-null     object
 3   Source    50 non-null     object
 4   Date      50 non-null     object
 5   Pages     50 non-null     int64 
 6   Summary   22 non-null     object
 7   PDF URL   34 non-null     object
 8   Tags      29 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.6+ KB


In [197]:
# find mising values
df_documents.isnull().sum()

ID           0
Title        0
Category     0
Source       0
Date         0
Pages        0
Summary     28
PDF URL     16
Tags        21
dtype: int64

Convert data types

In [198]:
# Convert to string 
# categorize columns by data type
str_cols = ["ID", "Title", "Summary", "PDF URL","Tags"]
df_documents_string = pd.DataFrame()
for c in str_cols:
    if c in df_documents.columns:
        df_documents[c] = df_documents[c].astype("string")
        df_documents_string[c] = df_documents[c]

# date column
if "Date" in df_documents.columns:
    df_documents["Date"] = pd.to_datetime(df_documents["Date"],errors ="coerce")
    df_documents_datetime = df_documents["Date"]

# Category columns
cat_cols =["Category", "Source"]
df_documents_cat = pd.DataFrame()
for c in cat_cols:
    if c in df_documents.columns:
        df_documents[c] = (
            df_documents[c]
            .astype("string")
            .str.strip()
            .str.lower()
            .astype("category")
        )
        df_documents_cat[c] = df_documents[c]

df_documents_numerical = df_documents["Pages"]

df_documents_cat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Category  50 non-null     category
 1   Source    50 non-null     category
dtypes: category(2)
memory usage: 720.0 bytes


In [199]:
df_documents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ID        50 non-null     string        
 1   Title     50 non-null     string        
 2   Category  50 non-null     category      
 3   Source    50 non-null     category      
 4   Date      50 non-null     datetime64[ns]
 5   Pages     50 non-null     int64         
 6   Summary   22 non-null     string        
 7   PDF URL   34 non-null     string        
 8   Tags      29 non-null     string        
dtypes: category(2), datetime64[ns](1), int64(1), string(5)
memory usage: 3.4 KB


In [200]:
# Categorical Features

for c in df_documents_cat:
    print(f"\n{c}")
    print(df_documents_cat[c].value_counts(normalize=True))


Category
Category
other             0.60
case-file         0.22
deposition        0.04
evidence          0.04
fbi-report        0.04
correspondence    0.02
legal-filing      0.02
photo             0.02
Name: proportion, dtype: float64

Source
Source
doj               0.78
court-unsealed    0.22
Name: proportion, dtype: float64


### Epstein Emails

In [201]:
df_emails.head()

Unnamed: 0,Date,From,From Email,To,Subject,Body (excerpt),Tags
0,2019-07-14,Quora Digest [digest-noreply@quora.com],,jeevacation@gmail.com,Is Denmark going bankrupt?,Jeffrey's Digest\nTOP STORIES FOR YOU\n\nIs De...,house-oversight; muneeb-llm; src-IMAGES-008-HO...
1,2019-07-09,Intelligence Squared [info@intelligencesquared...,,jeevacation@gmail.com,"John Humphrys: The Terrier of Today, in conver...","John Humphrys: The Terrier of Today, in conver...",house-oversight; muneeb-llm; src-IMAGES-010-HO...
2,2019-07-07,Quora Digest [digest-noreply@quora.com],,jeevacation@gmail.com,What is the French paradox?,Jeffrey's Digest\n\nTOP STORIES FOR YOU\n\nWha...,house-oversight; muneeb-llm; src-IMAGES-007-HO...
3,2019-07-06,Reid Weingarten,rweingarten@steptoe.com,Jeffrey Epstein,Re:,Have a hysterical wynn-trump issue that will m...,
4,2019-07-06,Jeffrey Epstein,je@jeffreyepstein.com,Reid Weingarten,Re:,"dont know. ,. bannon for breakfast tomor. . ev...",


In [202]:
print("Number of rows :", df_emails.shape[0])
print("Number of cols :", df_emails.shape[1])

Number of rows : 9493
Number of cols : 7


In [203]:
df_emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9493 entries, 0 to 9492
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            9493 non-null   object
 1   From            9180 non-null   object
 2   From Email      3893 non-null   object
 3   To              8616 non-null   object
 4   Subject         9229 non-null   object
 5   Body (excerpt)  9493 non-null   object
 6   Tags            5293 non-null   object
dtypes: object(7)
memory usage: 519.3+ KB


In [204]:
# find mising values
df_emails.isnull().sum()

Date                 0
From               313
From Email        5600
To                 877
Subject            264
Body (excerpt)       0
Tags              4200
dtype: int64

In [205]:
str_cols = ["From",	"From Email",	"To",	"Subject",	"Body (excerpt)",	"Tags"]
df_emails_string = pd.DataFrame()
for c in str_cols:
    if c in df_emails.columns:
        df_emails[c] = df_emails[c].astype("string")
        df_emails_string[c] = df_emails[c]

# date column
if "Date" in df_emails.columns:
    df_emails["Date"] = pd.to_datetime(df_emails["Date"],errors ="coerce")
    df_emails_datetime = df_emails["Date"]

df_emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9493 entries, 0 to 9492
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            9493 non-null   datetime64[ns]
 1   From            9180 non-null   string        
 2   From Email      3893 non-null   string        
 3   To              8616 non-null   string        
 4   Subject         9229 non-null   string        
 5   Body (excerpt)  9493 non-null   string        
 6   Tags            5293 non-null   string        
dtypes: datetime64[ns](1), string(6)
memory usage: 519.3 KB


### Epstein Flights

In [206]:
df_flights.head()


Unnamed: 0,Date,Origin,Destination,Aircraft,Passengers,Pilot,Notes
0,2007-01-07,TIST Airport,EWR Airport,N908JE (Boeing 727),Jeffrey Epstein; Ghislaine Maxwell; Igor Zinov...,David Rodgers / Larry Visoski,Source: Pilot logbook (USA v. Maxwell trial ex...
1,2006-11-21,EWR Airport,"Albuquerque International Sunport, NM",N908JE (Boeing 727),Jeffrey Epstein; Jennifer Kalin; Nadia Marcink...,David Rodgers / Larry Visoski,Photo of Kathryn Kucka and Sarah Kellen at Ghi...
2,2006-11-20,TIST Airport,EWR Airport,N908JE (Boeing 727),Jeffrey Epstein; Jennifer Kalin; Igor Zinoviev...,David Rodgers / Larry Visoski,Source: Pilot logbook (USA v. Maxwell trial ex...
3,2007-01-12,EWR Airport,TIST Airport,N908JE (Boeing 727),Jeffrey Epstein; Igor Zinoviev; Walter Cronkit...,David Rodgers / Larry Visoski,Source: Pilot logbook (USA v. Maxwell trial ex...
4,2006-11-25,"Albuquerque International Sunport, NM",TIST Airport,N908JE (Boeing 727),Jeffrey Epstein; Igor Zinoviev; Nadia Marcinko...,David Rodgers / Larry Visoski,Source: Pilot logbook (USA v. Maxwell trial ex...


In [207]:
print("Number of rows :", df_flights.shape[0])
print("Number of cols :", df_flights.shape[1])

Number of rows : 1708
Number of cols : 7


In [208]:
df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1708 entries, 0 to 1707
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         1708 non-null   object
 1   Origin       1708 non-null   object
 2   Destination  1708 non-null   object
 3   Aircraft     1708 non-null   object
 4   Passengers   873 non-null    object
 5   Pilot        875 non-null    object
 6   Notes        1708 non-null   object
dtypes: object(7)
memory usage: 93.5+ KB


In [209]:
# find mising values
df_flights.isnull().sum()

Date             0
Origin           0
Destination      0
Aircraft         0
Passengers     835
Pilot          833
Notes            0
dtype: int64

In [210]:
str_cols = ["Origin","Destination","Aircraft","Passengers","Pilot","Notes" ]
df_flights_string = pd.DataFrame()
for c in str_cols:
    if c in df_flights.columns:
        df_flights[c] = df_flights[c].astype("string")
        df_flights_string[c] = df_flights[c]

# date column
if "Date" in df_flights.columns:
    df_flights["Date"] = pd.to_datetime(df_flights["Date"],errors ="coerce")
    df_flights_datetime = df_flights["Date"]

df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1708 entries, 0 to 1707
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         1708 non-null   datetime64[ns]
 1   Origin       1708 non-null   string        
 2   Destination  1708 non-null   string        
 3   Aircraft     1708 non-null   string        
 4   Passengers   873 non-null    string        
 5   Pilot        875 non-null    string        
 6   Notes        1708 non-null   string        
dtypes: datetime64[ns](1), string(6)
memory usage: 93.5 KB


### Epstein Persons

In [211]:
df_persons.head()

Unnamed: 0,Name,Category,Bio,Aliases,Flights,Documents,Connections,In Black Book,Nationality
0,Jeffrey Epstein,other,Convicted sex offender; financier; died in cus...,Jeffrey Edward Epstein,553,520389,166,No,American
1,Ghislaine Maxwell,associate,Convicted sex trafficker; Epstein's closest as...,G-Max,413,11079,154,Yes,British
2,Sarah Kellen,associate,Epstein's primary scheduler; named co-conspira...,Sarah Kellen Vickers; Sarah Lyn Kensington,364,477,82,Yes,American
3,Igor Zinoviev,associate,Russian MMA fighter (UFC veteran); Epstein bod...,,19,81,47,Yes,Russian-American
4,Larry Visoski,associate,Epstein's chief pilot for 28 years; key Maxwel...,Lawrence Visoski,281,6131,43,Yes,American


In [212]:
print("Number of rows :", df_persons.shape[0])
print("Number of cols :", df_persons.shape[1])

Number of rows : 1505
Number of cols : 9


In [213]:
df_persons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1505 entries, 0 to 1504
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           1505 non-null   object
 1   Category       1505 non-null   object
 2   Bio            1504 non-null   object
 3   Aliases        322 non-null    object
 4   Flights        1505 non-null   int64 
 5   Documents      1505 non-null   int64 
 6   Connections    1505 non-null   int64 
 7   In Black Book  1505 non-null   object
 8   Nationality    651 non-null    object
dtypes: int64(3), object(6)
memory usage: 105.9+ KB


In [214]:
# find mising values
df_persons.isnull().sum()

Name                0
Category            0
Bio                 1
Aliases          1183
Flights             0
Documents           0
Connections         0
In Black Book       0
Nationality       854
dtype: int64

In [215]:
string_cols = [
    "Name",
    "Bio",
    "Aliases",
    "Flights",
    "Documents",
    "Connections"
]

# Convert string columns
for col in string_cols:
    if col in df_persons.columns:
        df_persons[col] = df_persons[col].astype("string")

category_cols = [
    "Category",
    "Nationality",
    "In Black Book"
]

df_persons_cat = pd.DataFrame()   # ‚Üê FIXED

for col in category_cols:
    if col in df_persons.columns:
        df_persons[col] = (
            df_persons[col]
            .astype("string")
            .str.strip()
            .str.lower()
            .astype("category")
        )
        df_persons_cat[col] = df_persons[col]

df_persons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1505 entries, 0 to 1504
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Name           1505 non-null   string  
 1   Category       1505 non-null   category
 2   Bio            1504 non-null   string  
 3   Aliases        322 non-null    string  
 4   Flights        1505 non-null   string  
 5   Documents      1505 non-null   string  
 6   Connections    1505 non-null   string  
 7   In Black Book  1505 non-null   category
 8   Nationality    651 non-null    category
dtypes: category(3), string(6)
memory usage: 78.2 KB


In [216]:
# Categorical Features

for c in df_persons_cat:
    print(f"\n{c}")
    print(df_persons_cat[c].value_counts(normalize=True))


Category
Category
associate                0.553488
business                 0.125581
other                    0.076412
celebrity                0.065781
politician               0.065116
academic                 0.041860
legal                    0.035880
socialite                0.023920
royalty                  0.007309
military-intelligence    0.004651
Name: proportion, dtype: float64

Nationality
Nationality
american              0.669739
british               0.093702
french                0.023041
norwegian             0.016897
italian               0.013825
                        ...   
italian-french        0.001536
japanese-american     0.001536
kazakhstani           0.001536
brazilian-american    0.001536
yemeni                0.001536
Name: proportion, Length: 74, dtype: float64

In Black Book
In Black Book
no     0.778738
yes    0.221262
Name: proportion, dtype: float64
