In [4]:
#Install pyliftover if you haven’t yet:
!pip install pyliftover


# Chromatin State Analysis
!pip install pybedtools


# downalod the liftover bedfile

!wget https://hgdownload.soe.ucsc.edu/gbdb/rheMac10/liftOver/rheMac10ToHg38.over.chain.gz

--2024-10-07 18:50:45--  https://hgdownload.soe.ucsc.edu/gbdb/rheMac10/liftOver/rheMac10ToHg38.over.chain.gz
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36912007 (35M) [application/x-gzip]
Saving to: ‘rheMac10ToHg38.over.chain.gz’


2024-10-07 18:50:58 (3.08 MB/s) - ‘rheMac10ToHg38.over.chain.gz’ saved [36912007/36912007]



In [7]:
import pandas as pd

# Read the full table of regions from disk
file_path = 'global_bycluster_all_regions.csv'
data = pd.read_csv(file_path)

# Keep only the rows whose p-value is strictly below the cutoff
# (tune the cutoff as required)
p_value_threshold = 0.05
filtered_data = data.loc[data['p_value'].lt(p_value_threshold)]

# Write the retained rows to disk so later steps can reuse them
filtered_file_path = 'filtered_regions.csv'
filtered_data.to_csv(filtered_file_path, index=False)


# Convert the filtered region coordinates with the rheMac10 -> hg38 chain.

from pyliftover import LiftOver

# The loop below reads these columns. Checking for them up front replaces a bare
# KeyError raised from inside the loop with a message that shows which columns
# the table actually has.
required_columns = ['chromosome', 'start', 'end']
missing_columns = [name for name in required_columns if name not in filtered_data.columns]
if missing_columns:
    raise KeyError(
        f"Region table is missing required column(s) {missing_columns}; "
        f"available columns are {list(filtered_data.columns)}"
    )

# Load the chain file
lo = LiftOver('rheMac10ToHg38.over.chain.gz')

# Lifted regions are collected here as [chromosome, start, end]
human_coords = []

# Lift both ends of every filtered region.
# NOTE(review): if the input uses BED-style half-open coordinates, `end` is the
# base just past the region — confirm the coordinate convention of the CSV.
for chrom, start, end in zip(
    filtered_data['chromosome'], filtered_data['start'], filtered_data['end']
):
    # Each call returns a list of (chromosome, position, strand, score) hits;
    # the result is falsy (None or an empty list) when nothing maps.
    human_start = lo.convert_coordinate(chrom, start)
    human_end = lo.convert_coordinate(chrom, end)

    if not (human_start and human_end):
        print(f"Region {chrom}:{start}-{end} could not be lifted over.")
        continue

    # Only the first hit of each end is used.
    start_chrom, start_pos = human_start[0][0], human_start[0][1]
    end_chrom, end_pos = human_end[0][0], human_end[0][1]

    # The two ends are lifted independently and may land on different
    # chromosomes; such a pair is not one interval, so report it and skip it.
    if start_chrom != end_chrom:
        print(
            f"Region {chrom}:{start}-{end} could not be lifted over "
            f"(ends map to {start_chrom} and {end_chrom})."
        )
        continue

    # When the alignment is on the reverse strand the lifted start is larger
    # than the lifted end; store the pair ordered so that start <= end, which
    # the BED format used later requires.
    human_coords.append(
        [start_chrom, min(start_pos, end_pos), max(start_pos, end_pos)]
    )

# Create a DataFrame for the lifted regions
lifted_df = pd.DataFrame(human_coords, columns=['chromosome', 'start', 'end'])
lifted_df.to_csv('lifted_regions.csv', index=False)

# Download the chromatin state file:
# https://www.encodeproject.org/files/ENCFF001XFG/@@download/ENCFF001XFG.bed.gz
# NOTE(review): the regions are lifted with a rheMac10 -> hg38 chain; confirm
# that this annotation file uses the same assembly before intersecting.


import requests

# URL of the chromatin state annotation file
url = 'https://www.encodeproject.org/files/ENCFF001XFG/@@download/ENCFF001XFG.bed.gz'

# Path to save the file
output_path = 'chromatin_state_annotations.bed.gz'

# Stream the file to disk.
# - raise_for_status() stops on an HTTP error before anything is written, so an
#   error page is never saved under the .bed.gz name.
# - timeout keeps a stalled connection from hanging the cell indefinitely.
# - the `with` block releases the connection once the download is finished.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                file.write(chunk)

print("Chromatin state annotation file downloaded successfully.")


# Decompress the downloaded annotation file

import gzip
import shutil

# Stream the gzip contents into an uncompressed copy next to the archive
with gzip.open('chromatin_state_annotations.bed.gz', 'rb') as f_in, \
        open('chromatin_state_annotations.bed', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print("File unzipped successfully.")



import pybedtools

# Load lifted regions as a BedTool object
lifted_bed = pybedtools.BedTool.from_dataframe(lifted_df)

# Load chromatin state annotations (assuming it's in BED format)
chromatin_file = 'chromatin_state_annotations.bed'
chromatin_bed = pybedtools.BedTool(chromatin_file)

# Intersect lifted regions with chromatin states.
# wa=True / wb=True write the complete record of both inputs, so every output
# row is the 3 region columns followed by all columns of the annotation record.
intersected = lifted_bed.intersect(chromatin_bed, wa=True, wb=True)

# Name every output column. Passing fewer names than there are columns makes
# pandas treat the leading columns as an index, which puts the labels on the
# wrong data and drops the real region columns when saving with index=False.
region_columns = ['chrom', 'start', 'end']
bed_field_names = [
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts',
]
annotation_columns = []
for position in range(chromatin_bed.field_count()):
    if position == 3:
        # presumably the 4th annotation column (BED "name") holds the state
        # label — verify against the downloaded file
        annotation_columns.append('chromatin_state')
    elif position < len(bed_field_names):
        annotation_columns.append(f'state_{bed_field_names[position]}')
    else:
        annotation_columns.append(f'state_field_{position + 1}')

# Convert the result to a DataFrame
intersected_df = intersected.to_dataframe(names=region_columns + annotation_columns)

# Save the chromatin state annotated regions
intersected_df.to_csv('chromatin_state_annotated_regions.csv', index=False)


KeyError: 'end'

In [8]:
# Same pipeline as the previous cell, re-run in an attempt to fix the error:
import pandas as pd
from pyliftover import LiftOver
import requests
import gzip
import shutil
import pybedtools

# Step 1: Read the region table and keep rows with p-value strictly below the cutoff
file_path = 'global_bycluster_all_regions.csv'
p_value_threshold = 0.05
data = pd.read_csv(file_path)
filtered_data = data.loc[data['p_value'].lt(p_value_threshold)]

# Write the retained rows to disk
filtered_file_path = 'filtered_regions.csv'
filtered_data.to_csv(filtered_file_path, index=False)

# Step 2: Perform liftover from rhesus macaque to human

# The loop below reads these columns. Checking for them up front replaces a bare
# KeyError raised from inside the loop with a message that shows which columns
# the table actually has.
required_columns = ['chromosome', 'start', 'end']
missing_columns = [name for name in required_columns if name not in filtered_data.columns]
if missing_columns:
    raise KeyError(
        f"Region table is missing required column(s) {missing_columns}; "
        f"available columns are {list(filtered_data.columns)}"
    )

lo = LiftOver('rheMac10ToHg38.over.chain.gz')
human_coords = []

# NOTE(review): if the input uses BED-style half-open coordinates, `end` is the
# base just past the region — confirm the coordinate convention of the CSV.
for chrom, start, end in zip(
    filtered_data['chromosome'], filtered_data['start'], filtered_data['end']
):
    # Convert start and end coordinates. Each call returns a list of
    # (chromosome, position, strand, score) hits; the result is falsy
    # (None or an empty list) when nothing maps.
    human_start = lo.convert_coordinate(chrom, start)
    human_end = lo.convert_coordinate(chrom, end)

    if not (human_start and human_end):
        print(f"Region {chrom}:{start}-{end} could not be lifted over.")
        continue

    # Only the first hit of each end is used.
    start_chrom, start_pos = human_start[0][0], human_start[0][1]
    end_chrom, end_pos = human_end[0][0], human_end[0][1]

    # The two ends are lifted independently and may land on different
    # chromosomes; such a pair is not one interval, so report it and skip it.
    if start_chrom != end_chrom:
        print(
            f"Region {chrom}:{start}-{end} could not be lifted over "
            f"(ends map to {start_chrom} and {end_chrom})."
        )
        continue

    # When the alignment is on the reverse strand the lifted start is larger
    # than the lifted end; store the pair ordered so that start <= end, which
    # the BED format used later requires.
    human_coords.append(
        [start_chrom, min(start_pos, end_pos), max(start_pos, end_pos)]
    )

# Save the lifted coordinates to a DataFrame and output as CSV
lifted_df = pd.DataFrame(human_coords, columns=['chromosome', 'start', 'end'])
lifted_df.to_csv('lifted_regions.csv', index=False)

# Step 3: Download and unzip the chromatin state annotation file
# NOTE(review): the regions are lifted with a rheMac10 -> hg38 chain; confirm
# that this annotation file uses the same assembly before intersecting.
url = 'https://www.encodeproject.org/files/ENCFF001XFG/@@download/ENCFF001XFG.bed.gz'
output_path = 'chromatin_state_annotations.bed.gz'

# Stream the file to disk.
# - raise_for_status() stops on an HTTP error before anything is written, so an
#   error page is never saved under the .bed.gz name (which would otherwise
#   only fail later, during decompression).
# - timeout keeps a stalled connection from hanging the cell indefinitely.
# - the `with` block releases the connection once the download is finished.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                file.write(chunk)
print("Chromatin state annotation file downloaded successfully.")

# Unzip the chromatin state file
with gzip.open(output_path, 'rb') as f_in, \
        open('chromatin_state_annotations.bed', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
print("File unzipped successfully.")

# Step 4: Intersect lifted regions with chromatin state annotations
lifted_bed = pybedtools.BedTool.from_dataframe(lifted_df)
chromatin_bed = pybedtools.BedTool('chromatin_state_annotations.bed')

# wa=True / wb=True write the complete record of both inputs, so every output
# row is the 3 region columns followed by all columns of the annotation record.
intersected = lifted_bed.intersect(chromatin_bed, wa=True, wb=True)

# Name every output column. Passing fewer names than there are columns makes
# pandas treat the leading columns as an index, which puts the labels on the
# wrong data and drops the real region columns when saving with index=False.
region_columns = ['chrom', 'start', 'end']
bed_field_names = [
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts',
]
annotation_columns = []
for position in range(chromatin_bed.field_count()):
    if position == 3:
        # presumably the 4th annotation column (BED "name") holds the state
        # label — verify against the downloaded file
        annotation_columns.append('chromatin_state')
    elif position < len(bed_field_names):
        annotation_columns.append(f'state_{bed_field_names[position]}')
    else:
        annotation_columns.append(f'state_field_{position + 1}')

# Convert the intersected result to DataFrame and save
intersected_df = intersected.to_dataframe(names=region_columns + annotation_columns)
intersected_df.to_csv('chromatin_state_annotated_regions.csv', index=False)
print("Chromatin state annotated regions saved successfully.")


KeyError: 'end'