### This notebook describes how to download protein sequence FASTA files from UniProt for proteins that were found using Foldseek searches.

In [None]:
# Read the combined Foldseek results table, keep hits with a TM score
# above 0.2, and build one list of hit accessions per query gene.

# library() stops immediately if a package is missing; require() would only
# warn and return FALSE, so the failure would surface later as a confusing
# "could not find function" error.
library(data.table)
library(dplyr)

# Read in Foldseek results
data <- as.data.frame(fread("combined-foldseek-results-table.tsv"))

# The first two columns are renamed by position to the names used below.
# NOTE(review): assumes column 1 is the query gene and column 2 is the hit
# accession -- confirm against the header of the input table.
stopifnot(ncol(data) >= 2)
colnames(data)[1:2] <- c("gene", "Accession")
stopifnot("TM_v_query" %in% colnames(data))
data

# Filter TM score > 0.2 (rows with a missing score are dropped by filter())
data <- data %>% filter(TM_v_query > 0.2)
data

# Return the accessions of all remaining hits for one query gene as a list.
# %in% treats a missing gene value as "no match", which is the same row
# selection subset(data, gene == gene_id) would make.
accessions_for <- function(gene_id) {
  as.list(data$Accession[data$gene %in% gene_id])
}

# Make lists from the filtered data: one variable per query gene, named
# after the query identifier used in the "gene" column.
ARX_Q96QS3 <- accessions_for("ARX_Q96QS3")
CSTB_P04080 <- accessions_for("CSTB_P04080")
c9orf72_Q96LT7 <- accessions_for("c9orf72_Q96LT7")
NOP56_O00567 <- accessions_for("NOP56_O00567")
JPH3_Q8WXH2 <- accessions_for("JPH3_Q8WXH2")
HTT_P42858 <- accessions_for("HTT_P42858")
PRNP_P04156 <- accessions_for("PRNP_P04156")
COMP_P49747 <- accessions_for("COMP_P49747")
NIPA1_Q7RTP0 <- accessions_for("NIPA1_Q7RTP0")
CBL_P22681 <- accessions_for("CBL_P22681")
VWA1_Q6PCB0 <- accessions_for("VWA1_Q6PCB0")
FXN_Q16595 <- accessions_for("FXN_Q16595")
NOTCH2NLC_P0DPK4 <- accessions_for("NOTCH2NLC_P0DPK4")
AR_P10275 <- accessions_for("AR_P10275")
ATXN3_P54252 <- accessions_for("ATXN3_P54252")
TBP_P20226 <- accessions_for("TBP_P20226")
STARD7_Q9NQZ5 <- accessions_for("STARD7_Q9NQZ5")
ATXN10_Q9UBB4 <- accessions_for("ATXN10_Q9UBB4")
DIP2B_Q9P265 <- accessions_for("DIP2B_Q9P265")

#### Using the lists created above, this code downloads the corresponding protein FASTA file from UniProt for each UniProt ID. Each query gene gets a single folder (named `<query>_aa`) that holds the FASTA files for all Foldseek hits of that query.

In [None]:
# Download the UniProt FASTA record for every hit accession, writing one
# file per accession into a per-query folder: "<query>_aa/<accession>.fasta".
# Files that already exist are skipped, so the cell can be re-run to resume
# an interrupted download.

# Required libraries
library(httr)

# Nested list of UniProt IDs. Naming each entry where it is added keeps the
# name and the contents together, instead of relying on two separately
# typed sequences staying in the same order.
list_of_lists <- list(
  ARX_Q96QS3 = ARX_Q96QS3,
  CSTB_P04080 = CSTB_P04080,
  c9orf72_Q96LT7 = c9orf72_Q96LT7,
  NOP56_O00567 = NOP56_O00567,
  JPH3_Q8WXH2 = JPH3_Q8WXH2,
  HTT_P42858 = HTT_P42858,
  PRNP_P04156 = PRNP_P04156,
  COMP_P49747 = COMP_P49747,
  NIPA1_Q7RTP0 = NIPA1_Q7RTP0,
  CBL_P22681 = CBL_P22681,
  VWA1_Q6PCB0 = VWA1_Q6PCB0,
  FXN_Q16595 = FXN_Q16595,
  NOTCH2NLC_P0DPK4 = NOTCH2NLC_P0DPK4,
  AR_P10275 = AR_P10275,
  ATXN3_P54252 = ATXN3_P54252,
  TBP_P20226 = TBP_P20226,
  STARD7_Q9NQZ5 = STARD7_Q9NQZ5,
  ATXN10_Q9UBB4 = ATXN10_Q9UBB4,
  DIP2B_Q9P265 = DIP2B_Q9P265
)

# Loop over each list
for (list_name in names(list_of_lists)) {

  # Create the folder for this query's amino acid sequences. Without it,
  # write() below fails with "cannot open file" on a fresh run.
  # showWarnings = FALSE keeps re-runs quiet when the folder already exists.
  dir_name <- paste0(list_name, "_aa")
  dir.create(dir_name, showWarnings = FALSE, recursive = TRUE)

  # Loop over each UniProt ID in the current list
  for (id in list_of_lists[[list_name]]) {

    # If the file already exists, skip to the next ID
    fasta_file <- file.path(dir_name, paste0(id, ".fasta"))
    if (file.exists(fasta_file)) {
      next
    }

    # URL of the FASTA record for this ID.
    # NOTE(review): this is the older www.uniprot.org address form; confirm
    # it still resolves, or switch to the current UniProt REST endpoint.
    url <- paste0("https://www.uniprot.org/uniprot/", id, ".fasta")

    # Send a GET request to the UniProt server. A transport error (timeout,
    # DNS failure, ...) is reported and skipped so that one bad request does
    # not abort the remaining downloads.
    response <- tryCatch(
      GET(url),
      error = function(e) {
        warning(
          "Request failed for ", id, " (", list_name, "): ",
          conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    if (is.null(response)) {
      next
    }

    # If the request was successful, save the amino acid sequence to a
    # fasta file; otherwise report which ID was not retrieved.
    if (status_code(response) == 200) {
      # An explicit encoding avoids httr's per-request encoding-guess message.
      fasta_text <- content(response, as = "text", encoding = "UTF-8")
      write(fasta_text, file = fasta_file)
    } else {
      warning(
        "No FASTA saved for ", id, " (", list_name, "): HTTP status ",
        status_code(response),
        call. = FALSE, immediate. = TRUE
      )
    }
  }
}


