In [1]:
%%bash
# Fetch the prescription records TSV from the platform path below.
# NOTE(review): no output path is given, so the file is presumably saved into the
# current working directory under its own name - confirm, because the loading cell
# reads 'gp_prescription_records.tsv' by relative path.
dx download "UKB Metabolomics:/agarham/gp_prescription_records.tsv"

In [2]:
# Import the libraries
import pandas as pd
import numpy as np

# Load the tab-separated prescription records into a DataFrame
records_path = 'gp_prescription_records.tsv'
df = pd.read_csv(records_path, sep='\t')

# Parse the issue dates and store them back as datetime values.
# NOTE(review): no explicit format is passed, so parsing relies on pandas'
# inference - confirm the file's date layout is unambiguous.
issue_dates = pd.to_datetime(df['issue_date'])
df['issue_date'] = issue_dates

In [3]:
# Read code prefixes to look for in the 'read_2' column
search_readcodes = [
    'm5A',
    'h87',
    'h8Q',
    'h8G',
    'h82',
    'h89',
    'h55',
    'h3G',
    'h34',
    'hha',
    'h8E',
    'm55',
]

# str.startswith accepts a tuple of alternatives; the match is case-sensitive
# and rows with a missing code count as "no match" (na=False).
readcode_prefixes = tuple(search_readcodes)
has_listed_readcode = df['read_2'].str.startswith(readcode_prefixes, na=False)

# Store the flag as 1 (code starts with a listed prefix) or 0 (otherwise)
df['read_match'] = has_listed_readcode.astype(int)

In [None]:
import re

# Drug-name prefixes: a row qualifies when 'drug_name' starts with one of these
prefixes = [
    'acitretin',
    'adalimumab',
    'apremilast',
    'capimune',
    'ciclosporin',
    'cimzia',
    'etanercept',
    'fumaderm',
    'fumaric',
    'humira',
    'hydrea',
    'hydroxycarbamide',
    'hydroxyurea',
    'maxtrex',
    'methotrexate',
    'metoject',
    'neoral',
    'neotigason',
    'ruxolitinib',
    'sandimmun',
    'stelara',
    'tigason',
    'tofacitinib',
    'ustekinumab'
]

# Exclusion terms: a row is rejected when 'drug_name' contains one of these
# (checked case-insensitively, anchored to the start of a word - see below)
exclusions = ['eye', 'oral']

# Match any prefix at the start of the name. The group is non-capturing so that
# pandas' str.contains does not warn about match groups in the pattern, and the
# terms are escaped so they are always treated as literal text.
prefix_pattern = r'^(?:' + '|'.join(re.escape(term) for term in prefixes) + ')'

# Match an exclusion term only where it begins a word. A bare substring search
# for 'oral' also hits the listed prefix 'neoral', which silently excluded every
# drug name starting with that prefix. The leading \b keeps "oral", "orally",
# "eye", "eyes", ... excluded while no longer matching inside "neoral".
exclusion_pattern = r'\b(?:' + '|'.join(re.escape(term) for term in exclusions) + ')'

# Treat missing drug names as empty strings so they simply never match
drug_names = df['drug_name'].fillna('')
starts_with_listed_drug = drug_names.str.contains(prefix_pattern, flags=re.IGNORECASE, regex=True)
has_exclusion_term = drug_names.str.contains(exclusion_pattern, flags=re.IGNORECASE, regex=True)

# 1 when the name starts with a listed prefix and carries no exclusion term, else 0
df['drug_match'] = (starts_with_listed_drug & ~has_exclusion_term).astype(int)

# Check result
print(df['read_match'].value_counts())
print(df['drug_match'].value_counts())

  df['drug_name'].fillna('').str.contains(prefix_pattern, flags=re.IGNORECASE, regex=True) &


In [None]:
# Flag a row when either the read code search or the drug name search matched
matched_by_code = df['read_match'].eq(1)
matched_by_name = df['drug_match'].eq(1)
df['gp_meds'] = (matched_by_code | matched_by_name).astype(int)

# Check result
print(df['gp_meds'].value_counts())

# Keep only the flagged rows whose issue date is not missing
is_flagged = df['gp_meds'].eq(1)
has_issue_date = df['issue_date'].notna()
df2 = df.loc[is_flagged & has_issue_date]

# Check result
print(len(df2))

In [None]:
# Preview the first five rows of the filtered records
df2.head(5)

In [None]:
# Write the filtered records to a tab-separated file, without the index column
output_path = 'severity_gprecords.tsv'
df2.to_csv(output_path, sep='\t', index=False)

In [None]:
%%bash
# Upload the filtered records file written by the saving cell.
# NOTE(review): no destination is given, so the target is presumably whatever
# project/folder dx currently has selected - confirm before relying on its location.
dx upload "severity_gprecords.tsv"