In [None]:
# Refresh the apt package index before installing the command-line tools below.
!apt-get update

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,255 kB]
Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [4,540 kB]
Get:13 http://archive.ubuntu.com/ubu

In [None]:
!apt-get install  -y fastqc
!apt-get install -y sra-toolkit
import subprocess


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fastqc is already the newest version (0.11.9+dfsg-5).
0 upgraded, 0 newly installed, 0 to remove and 93 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
sra-toolkit is already the newest version (2.11.3+dfsg-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 93 not upgraded.


In [None]:
# Template (disabled): download an SRA run — replace SRRXXXXXXX with your SRA run accession
#!prefetch SRRXXXXXXX

# Template (disabled): convert to FASTQ; --split-files splits paired-end reads into separate files
#!fastq-dump --split-files SRRXXXXXXX.sra


In [None]:
# Download the SRA run SRR9879594 with prefetch
!prefetch SRR9879594

# Convert to FASTQ (template, left disabled; replace SRRXXXXXXX before enabling)
#!fastq-dump --split-files SRRXXXXXXX.sra



2025-05-25T07:08:27 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2025-05-25T07:08:27 prefetch.2.11.3: 1) Downloading 'SRR9879594'...
2025-05-25T07:08:27 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.
2025-05-25T07:08:27 prefetch.2.11.3:  Downloading via HTTPS...
2025-05-25T07:08:45 prefetch.2.11.3:  HTTPS download succeed
2025-05-25T07:08:50 prefetch.2.11.3:  'SRR9879594' is valid
2025-05-25T07:08:50 prefetch.2.11.3: 1) 'SRR9879594' was downloaded successfully


In [None]:
import subprocess
from pathlib import Path

def _find_prefetched_file(sra_dir, sra_accession):
    """Return the .sra file prefetch wrote for ``sra_accession`` under ``sra_dir``, or None."""
    candidates = (
        # `prefetch -O <dir>` stores the run in its own sub-directory ...
        sra_dir / sra_accession / f"{sra_accession}.sra",
        # ... but also accept a flat layout.
        sra_dir / f"{sra_accession}.sra",
    )
    return next((path for path in candidates if path.is_file()), None)


def download_sra_to_fastq(sra_accession, split_files=True, max_spots=None):
    """
    Download SRA data and convert to FASTQ using SRA Toolkit commands.

    The run is fetched with ``prefetch`` (up to 3 attempts) into
    ``~/ncbi/public/sra`` and converted into FASTQ files in the current
    working directory. ``fasterq-dump`` is used for full conversions; because
    it has no spot-limit option, ``fastq-dump --maxSpotId`` is used instead
    whenever ``max_spots`` is given. The downloaded .sra file is removed after
    a successful conversion.

    Args:
        sra_accession (str): Valid SRA run accession (e.g., 'SRR11605094')
        split_files (bool): Split paired-end reads into separate files
        max_spots (int or None): Limit number of spots to convert
            (None or 0 = no limit)

    Returns:
        bool: True if successful, False otherwise (the error is printed,
        never raised)
    """
    import shutil  # local import: only needed for the cleanup step

    max_attempts = 3
    try:
        # Validate accession format
        if not sra_accession.startswith(('SRR', 'ERR', 'DRR')):
            raise ValueError("Accession must start with SRR, ERR, or DRR")

        # Create SRA directory structure
        sra_dir = Path.home() / "ncbi" / "public" / "sra"
        sra_dir.mkdir(parents=True, exist_ok=True)

        # 1. Download with prefetch (with retries)
        print(f"📥 Downloading {sra_accession}...")
        for attempt in range(1, max_attempts + 1):
            try:
                subprocess.run(
                    ['prefetch', sra_accession, '-O', str(sra_dir)],
                    check=True,
                    stderr=subprocess.PIPE,
                )
                break
            except subprocess.CalledProcessError as err:
                if attempt == max_attempts:
                    # stderr was captured, so surface it instead of dropping it.
                    detail = (err.stderr or b"").decode(errors="replace").strip()
                    raise RuntimeError(
                        f"prefetch failed after {max_attempts} attempts: {detail}"
                    ) from err
                print(f"⚠️ Retry {attempt}/{max_attempts}...")

        # 2. Verify download (nested or flat layout)
        sra_path = _find_prefetched_file(sra_dir, sra_accession)
        if sra_path is None:
            raise FileNotFoundError(
                f"SRA file for {sra_accession} not found under {sra_dir}"
            )

        # 3. Convert to FASTQ
        print("⚙️ Converting to FASTQ...")
        if max_spots:
            # fasterq-dump cannot limit spots; fastq-dump can.
            fastq_cmd = ['fastq-dump', '--outdir', '.', '--maxSpotId', str(max_spots)]
        else:
            fastq_cmd = ['fasterq-dump', '--outdir', '.', '--threads', '4', '--verbose']
        if split_files:
            fastq_cmd.append('--split-files')
        fastq_cmd.append(str(sra_path))

        subprocess.run(fastq_cmd, check=True)

        # 4. Cleanup: drop the per-run directory when prefetch created one
        if sra_path.parent != sra_dir:
            shutil.rmtree(sra_path.parent, ignore_errors=True)
        else:
            sra_path.unlink()
        print(f"✅ Successfully processed {sra_accession}")
        return True

    except Exception as e:
        print(f"❌ Failed to process {sra_accession}: {str(e)}")
        return False

In [None]:
import subprocess
import os

def download_rnaseq_sra(srr_id, out_dir='.'):
    """
    Fetch one SRA run with ``prefetch`` and unpack it to FASTQ with ``fasterq-dump``.
    Requires: SRA Toolkit installed and in PATH.

    Args:
        srr_id (str): The SRR accession code (e.g., 'SRR9879594').
        out_dir (str): Directory that receives the download and the FASTQ
            files; created if missing.

    Raises:
        subprocess.CalledProcessError: if either tool exits with a non-zero status.
    """
    os.makedirs(out_dir, exist_ok=True)

    download_cmd = ['prefetch', srr_id, '--output-directory', out_dir]
    # fasterq-dump is pointed at <out_dir>/<srr_id>, the location prefetch was told to use.
    convert_cmd = ['fasterq-dump', os.path.join(out_dir, srr_id), '-O', out_dir]

    steps = (
        (f"Downloading SRA file for {srr_id}...", download_cmd),
        (f"Converting {srr_id} to FASTQ...", convert_cmd),
    )
    for banner, command in steps:
        print(banner)
        subprocess.run(command, check=True)

    print(f"Download and conversion complete. Files saved in {out_dir}")

# Example usage:
# download_rnaseq_sra('SRR9879594', out_dir='./rna_seq_data')


# Download RNA-seq data

In [None]:
# Four consecutive run accessions, starting at SRR9879602.
list_sra = [f"SRR{9879602 + offset}" for offset in range(4)]

print(list_sra)
for srr_id in list_sra:
    download_rnaseq_sra(srr_id, out_dir='sra-tomato-4')


['SRR9879602', 'SRR9879603', 'SRR9879604', 'SRR9879605']
Downloading SRA file for SRR9879602...
Converting SRR9879602 to FASTQ...
Download and conversion complete. Files saved in sra-tomato-4
Downloading SRA file for SRR9879603...
Converting SRR9879603 to FASTQ...
Download and conversion complete. Files saved in sra-tomato-4
Downloading SRA file for SRR9879604...
Converting SRR9879604 to FASTQ...
Download and conversion complete. Files saved in sra-tomato-4
Downloading SRA file for SRR9879605...
Converting SRR9879605 to FASTQ...
Download and conversion complete. Files saved in sra-tomato-4


In [None]:


def download_sra_to_fastq(sra_accession, split_files=True, max_spots=None):
    """
    Download SRA data and convert to FASTQ using SRA Toolkit commands.

    NOTE(review): this notebook defines ``download_sra_to_fastq`` twice; once
    this cell has run, this definition replaces the earlier one.

    The SRA Toolkit is installed through apt-get only when ``prefetch`` or
    ``fastq-dump`` is missing from PATH. Errors are printed, not raised.

    Args:
        sra_accession (str): The SRA run accession (e.g., 'SRR1234567').
            Must start with SRR, ERR or DRR; a study accession such as
            'SRP217045' is rejected before any tool is run.
        split_files (bool): Whether to split paired-end reads into separate files.
        max_spots (int or None): Limit number of reads downloaded (None = no limit).

    Returns:
        None. Downloads files to current working directory.
    """
    # Local imports keep the function usable after a kernel restart.
    import shutil
    import subprocess

    if not str(sra_accession).startswith(('SRR', 'ERR', 'DRR')):
        print(
            f"Error occurred: '{sra_accession}' is not a run accession "
            "(expected an ID starting with SRR, ERR or DRR)."
        )
        return

    try:
        # Install SRA Toolkit (skip if already installed)
        if shutil.which('prefetch') is None or shutil.which('fastq-dump') is None:
            print("Installing SRA Toolkit...")
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(['apt-get', 'install', '-y', 'sra-toolkit'], check=True)

        # Download SRA file using prefetch
        print(f"Downloading SRA file for {sra_accession}...")
        subprocess.run(['prefetch', sra_accession], check=True)

        # Build fastq-dump command
        fastq_cmd = ['fastq-dump']
        if split_files:
            fastq_cmd.append('--split-files')
        if max_spots is not None:
            fastq_cmd.extend(['--maxSpotId', str(max_spots)])
        # Pass the accession itself: prefetch does not leave "<accession>.sra"
        # in the working directory, and fastq-dump resolves the accession.
        fastq_cmd.append(sra_accession)

        # Convert SRA to FASTQ
        print(f"Converting {sra_accession} to FASTQ...")
        subprocess.run(fastq_cmd, check=True)

        print("Download and conversion completed successfully.")

    except subprocess.CalledProcessError as e:
        print(f"Error occurred: {e}")

# Example usage: prefetch needs a *run* accession (SRR/ERR/DRR); with the study
# accession SRP217045 it exited with status 64.
download_sra_to_fastq('SRR9879594', split_files=True, max_spots=100000)


Installing SRA Toolkit...
Downloading SRA file for SRP217045...
Error occurred: Command '['prefetch', 'SRP217045']' returned non-zero exit status 64.


In [None]:
# Run FastQC quality control on the SRR9879602 FASTQ file.
!fastqc /content/sra-tomato-4/SRR9879602.fastq

Started analysis of SRR9879602.fastq
Approx 5% complete for SRR9879602.fastq
Approx 10% complete for SRR9879602.fastq
Approx 15% complete for SRR9879602.fastq
Approx 20% complete for SRR9879602.fastq
Approx 25% complete for SRR9879602.fastq
Approx 30% complete for SRR9879602.fastq
Approx 35% complete for SRR9879602.fastq
Approx 40% complete for SRR9879602.fastq
Approx 45% complete for SRR9879602.fastq
Approx 50% complete for SRR9879602.fastq
Approx 55% complete for SRR9879602.fastq
Approx 60% complete for SRR9879602.fastq
Approx 65% complete for SRR9879602.fastq
Approx 70% complete for SRR9879602.fastq
Approx 75% complete for SRR9879602.fastq
Approx 80% complete for SRR9879602.fastq
Approx 85% complete for SRR9879602.fastq
Approx 90% complete for SRR9879602.fastq
Approx 95% complete for SRR9879602.fastq
Analysis complete for SRR9879602.fastq


In [None]:
# Install the fastp read trimmer with apt (-y: no confirmation prompt).
!apt-get install -y fastp

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fastp
0 upgraded, 1 newly installed, 0 to remove and 91 not upgraded.
Need to get 193 kB of archives.
After this operation, 640 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fastp amd64 0.20.1+dfsg-1 [193 kB]
Fetched 193 kB in 0s (598 kB/s)
Selecting previously unselected package fastp.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../fastp_0.20.1+dfsg-1_amd64.deb ...
Unpacking fastp (0.20.1+dfsg-1) ...
Setting up fastp (0.20.1+dfsg-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Trim single-end FASTQ file
# NOTE(review): input.fastq is not created by any cell visible here — treat the
# file names as placeholders and replace them with real paths.
!fastp -i input.fastq -o trimmed_output.fastq -h fastp_report.html -j fastp_report.json

/bin/bash: line 1: fastp: command not found


In [None]:
# Trim single-end FASTQ file
!fastp -i /content/SRR2016724_1.fastq -o SRR2016724_1.fastq -h fastp_report.html -j fastp_report.json

/bin/bash: line 1: fastp: command not found


In [None]:
# Trim paired-end FASTQ files (-i/-I: input mates, -o/-O: trimmed mates).
# NOTE(review): input_1.fastq / input_2.fastq are not created by any cell visible
# here — treat the file names as placeholders.
!fastp -i input_1.fastq -I input_2.fastq -o trimmed_1.fastq -O trimmed_2.fastq -h fastp_report.html -j fastp_report.json

In [None]:
# Download the mouse (GRCm39) reference genome from Ensembl release 114.
# NOTE(review): no later cell in this view reads this file; the steps below use the tomato reference.
!wget   https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz

--2025-05-21 05:26:14--  https://ftp.ensembl.org/pub/release-114/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.169
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 806418890 (769M) [application/x-gzip]
Saving to: ‘Mus_musculus.GRCm39.dna.toplevel.fa.gz’


2025-05-21 05:26:40 (29.8 MB/s) - ‘Mus_musculus.GRCm39.dna.toplevel.fa.gz’ saved [806418890/806418890]



### Tomato reference genome

In [None]:
# Download the tomato (Solanum lycopersicum, SL3.0) reference genome from Ensembl Plants release 61.
!wget https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-61/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz

--2025-05-27 06:50:35--  https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-61/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz
Resolving ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)... 193.62.193.161
Connecting to ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)|193.62.193.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 227747063 (217M) [application/x-gzip]
Saving to: ‘Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz’


2025-05-27 06:50:53 (13.2 MB/s) - ‘Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz’ saved [227747063/227747063]



In [None]:
# Decompress the tomato reference in place; afterwards only the .fa file remains
# (a second gunzip of the same path fails with "No such file or directory").
!gunzip /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz

In [None]:
# Install conda inside Colab via condacolab.
# NOTE: condacolab.install() restarts the kernel, so variables, functions and
# imports defined in earlier cells are gone after this cell.
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:11
🔁 Restarting kernel...


In [None]:
!wget https://cloud.biohpc.swmed.edu/index.php/s/hisat2-220-download/download -O hisat2.zip
!unzip hisat2.zip

--2025-05-25 05:20:53--  https://cloud.biohpc.swmed.edu/index.php/s/hisat2-220-download/download
Resolving cloud.biohpc.swmed.edu (cloud.biohpc.swmed.edu)... 129.112.9.92
Connecting to cloud.biohpc.swmed.edu (cloud.biohpc.swmed.edu)|129.112.9.92|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-05-25 05:20:53 ERROR 404: Not Found.

Archive:  hisat2.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of hisat2.zip or
        hisat2.zip.zip, and cannot find hisat2.zip.ZIP, period.


In [None]:
# Install HISAT2 from the bioconda channel (-y: no confirmation prompt).
!conda install -c bioconda hisat2 -y

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - hisat2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.4.26  |       hbd8a1cb_0         149 KB  conda-forge
    certifi-2025.4.26          |     pyhd8ed1ab_0         154 KB  conda-forge
    conda-24.11.3              |  py311h38be061_0         1.1 MB  conda-forge
    hisat2-2.2.1               |       h503566f_8        15.9 MB  bioconda
    openssl-3.5.0              |       h7b32b05_1         3.0 MB  conda-forge
    perl-5.32.1                | 7_hd590300_perl5        12.7 MB  conda-forg

In [None]:
# Alternative: install HISAT2 with apt.
# NOTE(review): overlaps with the `conda install -c bioconda hisat2` cell — one of the two is probably enough.
!sudo apt-get update
!sudo apt-get -y install hisat2


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,683 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,729 kB]
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,245 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 k

# Build reference indexes

In [None]:
!gunzip /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz


In [None]:
# Build the HISAT2 index of the tomato reference with -p 8;
# the output files are reference_index.*.ht2 in the working directory.
!hisat2-build -p 8 /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa reference_index


Settings:
  Output files: "reference_index.*.ht2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Local offset rate: 3 (one in 8)
  Local fTable chars: 6
  Local sequence length: 57344
  Local sequence overlap between two consecutive indexes: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa
Reading reference sizes
  Time reading reference sizes: 00:00:12
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:07
  Time to read SNPs and splice sites: 00:00:00
Using parameters --bmax 17492390 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 17492390 --dcv

# Mapping

In [None]:
# Paired-end mapping template (-1 / -2 take the two mate files).
# reads_1.fastq and reads_2.fastq do not exist here — replace them with real FASTQ paths.
!hisat2 -p 4 -x reference_index -1 reads_1.fastq -2 reads_2.fastq -S output.sam --summary-file summary.txt

(ERR): Read file 'reads_1.fastq' doesn't exist
Exiting now ...


In [None]:
# Single-end mapping template (-U takes the unpaired read file).
# reads.fastq does not exist here — replace it with a real FASTQ path.
!hisat2 -p 4 -x reference_index -U reads.fastq -S output.sam --summary-file summary.txt


(ERR): Read file 'reads.fastq' doesn't exist
Exiting now ...


In [None]:
!mkdir random_name

mkdir: cannot create directory ‘random_name’: File exists


In [None]:
!gunzip /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz

gzip: /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz: No such file or directory


In [None]:
# Build the HISAT2 index of the tomato reference (no -p option this time).
# NOTE(review): repeats the earlier `hisat2-build -p 8` cell and writes the same reference_index.* files.
!hisat2-build /content/Solanum_lycopersicum.SL3.0.dna.toplevel.fa reference_index

/bin/bash: line 1: hisat2-build: command not found


# SAM / BAM

In [None]:
# prompt: i have 8 index.ht2 file and 4 .fastq file i want to make .sam fille with hsat2

import glob
import subprocess


def plan_hisat2_jobs(fastq_files):
    """Group FASTQ paths into HISAT2 alignment jobs.

    ``X_1.fastq`` is paired with ``X_2.fastq`` when both are present. Every
    other file, including a ``_1``/``_2`` file whose mate is missing (a warning
    is printed for those), becomes a single-end job. The input list is not
    modified and the jobs come back in sorted order.

    Args:
        fastq_files: iterable of FASTQ file paths ending in ``.fastq``.

    Returns:
        list of ``(reads, mate2_or_None, output_sam)`` tuples.
    """
    available = set(fastq_files)
    jobs = []
    for fastq_file in sorted(available):
        stem = fastq_file[:-len('.fastq')] if fastq_file.endswith('.fastq') else fastq_file
        mate2 = None
        if fastq_file.endswith('_1.fastq'):
            candidate = stem[:-len('_1')] + '_2.fastq'
            if candidate in available:
                mate2 = candidate
            else:
                print(f"Warning: Found {fastq_file} but no matching paired-end file.")
        elif fastq_file.endswith('_2.fastq'):
            if stem[:-len('_2')] + '_1.fastq' in available:
                # Handled together with its _1 mate.
                continue
            print(f"Warning: Found {fastq_file} but no matching paired-end file.")
        jobs.append((fastq_file, mate2, stem + '.sam'))
    return jobs


# Get a list of all fastq files in the current directory
fastq_files = glob.glob('*.fastq')

# Assuming the index files are in the same directory
index_prefix = 'reference_index' # Replace with the actual prefix if different

for reads, mate2, output_sam in plan_hisat2_jobs(fastq_files):
    summary_file = output_sam[:-len('.sam')] + '_summary.txt'
    if mate2 is not None:
        print(f"Mapping paired-end reads: {reads} and {mate2} to {output_sam}")
        read_args = ['-1', reads, '-2', mate2]
    else:
        print(f"Mapping single-end read: {reads} to {output_sam}")
        read_args = ['-U', reads]

    # Argument list instead of a shell string: file names are passed verbatim.
    result = subprocess.run(
        ['hisat2', '-p', '4', '-x', index_prefix, *read_args,
         '-S', output_sam, '--summary-file', summary_file],
        capture_output=True,
        text=True,
    )
    # Echo the tool output so it still shows up in the cell output.
    print(result.stdout, end='')
    print(result.stderr, end='')
    if result.returncode != 0:
        print(f"Warning: hisat2 exited with status {result.returncode} for {reads}")

print("HISAT2 mapping complete for all fastq files.")



In [None]:
# List the HISAT2 index files (*.ht2) in the working directory with their sizes.
!ls -lh *.ht2

-rw-r--r-- 1 root root 242M May 25 05:52 reference_index.1.ht2
-rw-r--r-- 1 root root 178M May 25 05:52 reference_index.2.ht2
-rw-r--r-- 1 root root 201K May 25 05:24 reference_index.3.ht2
-rw-r--r-- 1 root root 178M May 25 05:24 reference_index.4.ht2
-rw-r--r-- 1 root root 341M May 25 05:54 reference_index.5.ht2
-rw-r--r-- 1 root root 182M May 25 05:54 reference_index.6.ht2
-rw-r--r-- 1 root root   12 May 25 05:25 reference_index.7.ht2
-rw-r--r-- 1 root root    8 May 25 05:25 reference_index.8.ht2


In [None]:
!mkdir -p refgene
!mv *.ht2 refgene

In [None]:
!hisat2 -x /content/reference_index/*.ht2 -U /content/sra-tomato-4/*.fastq -S combined_output.sam


(ERR): "/content/reference_index/reference_index.1.ht2" does not exist
Exiting now ...


In [None]:
# FASTQ file names of the four consecutive runs starting at SRR9879602.
list_sra = ["SRR%d.fastq" % run_number for run_number in range(9879602, 9879602 + 4)]
print(list_sra)


['SRR9879602.fastq', 'SRR9879603.fastq', 'SRR9879604.fastq', 'SRR9879605.fastq']


In [None]:
# Align SRR9879602 as unpaired reads (-U) against the index with basename
# /content/reference_index/reference_index; the SAM goes to /content/combined_output.sam.
# NOTE(review): despite its name, combined_output.sam holds this one sample only.
!hisat2 -x /content/reference_index/reference_index \
        -U /content/sra-tomato-4/SRR9879602.fastq\
        -S /content/combined_output.sam

17401320 reads; of these:
  17401320 (100.00%) were unpaired; of these:
    1936793 (11.13%) aligned 0 times
    15274069 (87.78%) aligned exactly 1 time
    190458 (1.09%) aligned >1 times
88.87% overall alignment rate


In [None]:
# Sanity-check the alignment output: file size, then the first lines of the SAM file.
!ls -lh /content/combined_output.sam  # Verify SAM file was created
!head /content/combined_output.sam    # Peek at alignment results

-rw-r--r-- 1 root root 6.0G May 25 11:44 /content/combined_output.sam
@HD	VN:1.0	SO:unsorted
@SQ	SN:1	LN:98455869
@SQ	SN:2	LN:55977580
@SQ	SN:3	LN:72290146
@SQ	SN:4	LN:66557038
@SQ	SN:5	LN:66723567
@SQ	SN:6	LN:49794276
@SQ	SN:7	LN:68175699
@SQ	SN:8	LN:65987440
@SQ	SN:9	LN:72906345


In [None]:
!samtools view -b /content/combined_output.sam > /content/combined_output.bam

In [None]:
!# Sort BAM by genomic coordinates
!samtools sort -@ 4 -o sample.sorted.bam /content/combined_output.bam

# Index the sorted BAM
!samtools index sample.sorted.bam

[bam_sort_core] merging from 4 files and 4 in-memory blocks...


In [None]:
!pip install HTSeq  # Install HTSeq Python package
!htseq-count --help # Verify installation

Collecting HTSeq
  Downloading HTSeq-2.0.9-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting numpy (from HTSeq)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pysam (from HTSeq)
  Downloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Downloading HTSeq-2.0.9-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m126.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl (26.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.0/26.0 MB[0m [31m127.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Download the tomato gene annotation (GFF3, Ensembl Plants release 61).
!wget  https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-61/gff3/solanum_lycopersicum/Solanum_lycopersicum.SL3.0.61.chr.gff3.gz

--2025-05-25 12:27:08--  https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-61/gff3/solanum_lycopersicum/Solanum_lycopersicum.SL3.0.61.chr.gff3.gz
Resolving ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)... 193.62.193.161
Connecting to ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)|193.62.193.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6448195 (6.1M) [application/x-gzip]
Saving to: ‘Solanum_lycopersicum.SL3.0.61.chr.gff3.gz’


2025-05-25 12:27:10 (7.61 MB/s) - ‘Solanum_lycopersicum.SL3.0.61.chr.gff3.gz’ saved [6448195/6448195]



In [None]:
# Decompress the annotation in place to /content/Solanum_lycopersicum.SL3.0.61.chr.gff3.
!gunzip /content/Solanum_lycopersicum.SL3.0.61.chr.gff3.gz

In [None]:
# Count reads per feature with htseq-count (-f bam: BAM input, -s no: unstranded) into counts.txt.
# NOTE: /content/tomato.gtf is produced by the gffread cell further down, so that
# cell has to run before this one on a fresh session.
!htseq-count -f bam -s no sample.sorted.bam /content/tomato.gtf > counts.txt

100000 GFF lines processed.
200000 GFF lines processed.
300000 GFF lines processed.
376550 GFF lines processed.
100000 alignment records processed.
200000 alignment records processed.
300000 alignment records processed.
400000 alignment records processed.
500000 alignment records processed.
600000 alignment records processed.
700000 alignment records processed.
800000 alignment records processed.
900000 alignment records processed.
1000000 alignment records processed.
1100000 alignment records processed.
1200000 alignment records processed.
1300000 alignment records processed.
1400000 alignment records processed.
1500000 alignment records processed.
1600000 alignment records processed.
1700000 alignment records processed.
1800000 alignment records processed.
1900000 alignment records processed.
2000000 alignment records processed.
2100000 alignment records processed.
2200000 alignment records processed.
2300000 alignment records processed.
2400000 alignment records processed.
2500000 a

In [None]:
!htseq-count \
  -f bam \
  -s no \               # Strandedness: adjust if needed ('yes' or 'reverse')
  -r pos \              # BAM is coordinate-sorted
  -t exon \             # Count exons (common for GFF3)
  -i Parent \           # Use "Parent" to link exons to genes
  sample.sorted.bam \
  Solanum_lycopersicum.SL3.0.61.chr.gff3 \
  > counts.txt

IndentationError: unexpected indent (<ipython-input-89-8ab41a901e26>, line 2)

In [None]:
# Inspect the GFF3 structure (e.g., for gene/exon entries):
!zcat /content/Solanum_lycopersicum.SL3.0.61.chr.gff3 | grep -E "\tgene\t|\texon\t|\tmRNA\t" | head -n 5 | cut -f9


gzip: /content/tomato.gtf: not in gzip format


In [None]:
# Install gffread
!apt-get install gffread

# Convert GFF3 → GTF
!gffread Solanum_lycopersicum.SL3.0.61.chr.gff3 -T -o tomato.gtf

# Now run htseq-count with the GTF
!htseq-count \
  -f bam \
  -s no \
  -r pos \
  tomato.sorted.bam \
  tomato.gtf \
  > counts.txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libgclib3
The following NEW packages will be installed:
  gffread libgclib3
0 upgraded, 2 newly installed, 0 to remove and 93 not upgraded.
Need to get 230 kB of archives.
After this operation, 678 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libgclib3 amd64 0.12.7+ds-4 [162 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 gffread amd64 0.12.7-2build1 [67.4 kB]
Fetched 230 kB in 1s (313 kB/s)
Selecting previously unselected package libgclib3:amd64.
(Reading database ... 127241 files and directories currently installed.)
Preparing to unpack .../libgclib3_0.12.7+ds-4_amd64.deb ...
Unpacking libgclib3:amd64 (0.12.7+ds-4) ...
Selecting previously unselected package gffread.
Preparing to unpack .../gffread_0.12.7-2build1_amd64.deb ...
Unpacking gffread (0.12.7-2build1) .