In [98]:
import numpy as np
import pandas as pd

In [99]:
df = pd.read_csv('POSEIDON.csv', sep=",", header=0, index_col=False, encoding='latin-1', on_bad_lines='skip')

In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2095 entries, 0 to 2094
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Peptide    2094 non-null   object
 1   Cell line  2082 non-null   object
 2   Cargo      2029 non-null   object
 3   PubmedID   2095 non-null   int64 
 4   Uptake     2095 non-null   object
 5   Units      2059 non-null   object
 6   Conc.      1903 non-null   object
 7   Time       2006 non-null   object
 8   Temp.      1537 non-null   object
 9   Method     2087 non-null   object
 10  Type       2086 non-null   object
 11  Sequence   2091 non-null   object
dtypes: int64(1), object(11)
memory usage: 196.5+ KB


In [101]:
df['Uptake'] = df['Uptake'].str.extract(r'(\d+)')[0].astype(float)

In [102]:
df['Method'].value_counts()

Method
Flow cytometry                           1348
Fluorescent Microscopy                    229
Fluorescence spectroscopy                 215
CLSM                                      108
Cytotoxicity analysis                      60
HPCL-analysis                              25
MALDI-TOF                                  18
ELISA                                      13
Gamma-well counter                         12
FT-IR                                      12
FCS                                         6
Fluorometer                                 6
Immunocytochemistry                         6
Immunohistochemistry                        5
Immunofluorescence analysis                 5
Confocal laser scanning system (clss)       4
Western blot                                4
Fluoroskan plate reader                     4
Liquid chromatography                       3
BCARS Imaging                               1
Fluorescence imaging                        1
STEM                       

In [103]:
import re

def convert_to_micromoles(conc):
    # Convert to string and remove whitespace
    conc_str = str(conc).strip()
    # Use regex to extract the numeric part and the unit
    match = re.match(r'([\d.]+)\s*(nM|uM)', conc_str)
    if match:
        value = float(match.group(1))
        unit = match.group(2)
        # Convert to micromoles
        if unit == 'nM':
            return value / 1000.0  # Convert nM to µM
        elif unit == 'uM':
            return value  # Already in µM
    return None  # In case of any unexpected format

In [104]:
df['Conc.'] = df['Conc.'].apply(convert_to_micromoles)

In [105]:
def convert_to_minutes(time):
    if pd.isna(time):  # Check if the value is NaN
        return 0  # Or return an appropriate value like None or a specific number
    
    time_str = str(time).strip()  # Convert to string and strip whitespace
    total_minutes = 0
    
    # Match hours and minutes separately
    hour_match = re.search(r'(\d+)\s*h', time_str)
    minute_match = re.search(r'(\d+)\s*min', time_str)
    
    if hour_match:
        total_minutes += int(hour_match.group(1)) * 60  # Convert hours to minutes
    
    if minute_match:
        total_minutes += int(minute_match.group(1))  # Add minutes
    
    return total_minutes

In [106]:
df['Time'] = df['Time'].apply(convert_to_minutes)

In [107]:
df['Temp.'] = df['Temp.'].str.replace('ºC', '', regex=False)  # Remove 'oC'
df['Temp.'] = pd.to_numeric(df['Temp.'], errors='coerce')  # Convert to float, coerce errors to NaN

In [108]:
valid_amino_acids = set("ACDEFGHIKLMNPQRSTVWY")

def is_valid_sequence(sequence):
    # Check if the sequence is a string and contains only valid amino acids
    if isinstance(sequence, str):
        return all(char in valid_amino_acids for char in sequence)
    return False  # Non-string sequences (like NaN) are considered invalid

# Filter the DataFrame to keep only valid sequences
df = df[df['Sequence'].apply(is_valid_sequence)]

In [109]:
flow_df = df[(df['Method'] == 'Flow cytometry') & (df['Uptake'] != 0)]

flow_df = flow_df.groupby(['Sequence']).agg({'Uptake': ['min', 'max']})['Uptake']
flow_df = flow_df[(flow_df['min'] == flow_df['max'])]
flow_df = flow_df['min'].reset_index()
flow_df = flow_df.rename(columns={"min": "uptake"})
flow_df

Unnamed: 0,Sequence,uptake
0,AAKLLPDLLAAP,1.0
1,AAVLLPVLLAAP,1.0
2,ACGRGRGRCRGRGRGCG,20.0
3,ACRGSGRGCGRGSGRCG,3.0
4,ACRRSRRGCGRRSRRCG,45.0
...,...,...
241,YGRKKRPQRRR,1.0
242,YGRKKRRQRRRC,9.0
243,YKQCHKKGGHCFPKEKICLPPSSDFGKMDCRWRWKCCKKGSG,46.0
244,YSSYSAPVSSSLSVRRSYSSSSGS,17.0


In [115]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

records = [SeqRecord(Seq(row["Sequence"]), id=str(index)) for index, row in flow_df.iterrows()]

SeqIO.write(records, "flow_sequences.fasta", "fasta")
# https://www.ebi.ac.uk/jdispatcher/msa/clustalo/summary?jobId=clustalo-I20240721-202427-0705-32047303-p1m&js=pass

246

In [121]:
flow_sorted_df = flow_df.sort_values(by='uptake', ascending=False)
flow_sorted_first_10_percent_df = flow_sorted_df.iloc[:int(len(flow_sorted_df) * 0.1)]

records = [SeqRecord(Seq(row["Sequence"]), id=str(index)) for index, row in flow_sorted_first_10_percent_df.iterrows()]
SeqIO.write(records, "flow_first_sequences.fasta", "fasta")
flow_sorted_first_10_percent_df

Unnamed: 0,Sequence,uptake
234,VQWRIRVAVIRK,130000.0
110,KWRRKLKKLRPKKKRKV,50100.0
96,KKYRGRKRHPR,23640.0
12,APKRKKLKKRF,21370.0
68,GRKAARAPGRRKQ,18640.0
103,KQPRIKRKK,18400.0
118,LKKRRKLPKKKPIRNEQ,17500.0
29,CREKAPLGLAGRKKRRQRRRC,14110.0
186,RQIKIWFQNRRMKWKKK,13000.0
9,AGSHRRL,9320.0


In [177]:
from collections import Counter

def get_ngrams_set(sequence, n):
    return set([sequence[i:i+n] for i in range(len(sequence) - n + 1)])

for_ngram_top = flow_sorted_first_10_percent_df
for_ngram_all = flow_df

most_common_ngrams_dfs = list()
for n in range(3, 7):
    for_ngram_top['ngrams'] = for_ngram_top['Sequence'].apply(lambda x: get_ngrams_set(x,n))
    all_ngrams = [ngram for ngrams in for_ngram_top['ngrams'] for ngram in ngrams]
    ngram_counter = Counter(all_ngrams)
    most_common_ngrams = ngram_counter.most_common()
    most_common_ngrams_dfs.append(pd.DataFrame(most_common_ngrams, columns=[f"{n}gram_top", f"count_{n}gram"]))
    
    for_ngram_all['ngrams'] = for_ngram_all['Sequence'].apply(lambda x: get_ngrams_set(x,n))
    all_ngrams = [ngram for ngrams in for_ngram_all['ngrams'] for ngram in ngrams]
    ngram_counter = Counter(all_ngrams)
    most_common_ngrams = ngram_counter.most_common()
    most_common_ngrams_dfs.append(pd.DataFrame(most_common_ngrams, columns=[f"{n}gram_all", f"count_{n}gram"]))

result_df = pd.concat(most_common_ngrams_dfs, axis=1)
result_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_ngram_top['ngrams'] = for_ngram_top['Sequence'].apply(lambda x: get_ngrams_set(x,n))


Unnamed: 0,3gram_top,count_3gram,3gram_all,count_3gram.1,4gram_top,count_4gram,4gram_all,count_4gram.1,5gram_top,count_5gram,5gram_all,count_5gram.1,6gram_top,count_6gram,6gram_all,count_6gram.1
0,RRR,7.0,RRR,68.0,RRRR,4.0,RRRR,34.0,RRRRR,4.0,RRRRR,29.0,RRRRRR,4.0,RRRRRR,27
1,KKR,5.0,KKR,39.0,PKKK,3.0,RKKR,32.0,LKKLR,2.0,KRRQR,22.0,KLKKLR,2.0,KKRRQR,20
2,LKK,4.0,KRR,36.0,KLKK,3.0,RRQR,26.0,RRKLK,2.0,KKRRQ,21.0,KKKRKV,2.0,KRRQRR,19
3,RRK,4.0,RKK,35.0,RRKL,3.0,KKRR,24.0,KLKKL,2.0,RRQRR,21.0,RRKLKK,2.0,RKKRRQ,19
4,KKK,4.0,RRQ,31.0,KKRR,3.0,RQRR,24.0,PKKKR,2.0,RKKRR,21.0,WRRKLK,2.0,RRQRRR,17
5,KRK,4.0,RQR,30.0,KWRR,2.0,KRRQ,23.0,KKRKV,2.0,RQRRR,20.0,KWRRKL,2.0,YSPTTR,16
6,GRK,4.0,QRR,28.0,LKKL,2.0,QRRR,22.0,KKKRK,2.0,TRRYG,19.0,PKKKRK,2.0,SPTTRR,16
7,RKK,4.0,GRK,25.0,KRKV,2.0,TRRY,20.0,WRRKL,2.0,GRKKR,18.0,RKLKKL,2.0,TTRRYG,16
8,KRR,4.0,YSP,24.0,RKLK,2.0,RRYG,20.0,RKLKK,2.0,YSPTT,17.0,RRQRRR,2.0,PTTRRY,16
9,PKK,3.0,TRR,23.0,KKLR,2.0,PTTR,18.0,KWRRK,2.0,PTTRR,17.0,GRKKRR,2.0,WMRWYS,14


### Описание
3грам - LKK
- https://www.sciencedirect.com/science/article/abs/pii/S0021979709015215
- https://www.pnas.org/doi/abs/10.1073/pnas.91.17.8152

4грам - PKKK
- https://academic.oup.com/genetics/article/149/4/1753/6034271
- https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1002385
- https://www.jbc.org/article/S0021-9258(19)53186-6/fulltext

5грам - LKKLR
- https://www.cell.com/AJHG/fulltext/S0006-3495(00)76656-2
- https://link.springer.com/chapter/10.1007/978-1-4615-0667-6_15

In [173]:
def get_ngrams(sequence, n):
    return [sequence[i:i+n] for i in range(len(sequence) - n + 1)]

for_ngram_top = flow_sorted_first_10_percent_df
for_ngram_all = flow_df

most_common_ngrams_dfs = list()
for n in range(3, 7):
    for_ngram['ngrams'] = for_ngram['Sequence'].apply(lambda x: get_ngrams(x,n))
    all_ngrams = [ngram for ngrams in for_ngram['ngrams'] for ngram in ngrams]
    ngram_counter = Counter(all_ngrams)
    most_common_ngrams = ngram_counter.most_common()
    most_common_ngrams_dfs.append(pd.DataFrame(most_common_ngrams, columns=[f"{n}gram_top", f"count_{n}gram"]))
    
    for_ngram_all['ngrams'] = for_ngram_all['Sequence'].apply(lambda x: get_ngrams(x,n))
    all_ngrams = [ngram for ngrams in for_ngram_all['ngrams'] for ngram in ngrams]
    ngram_counter = Counter(all_ngrams)
    most_common_ngrams = ngram_counter.most_common()
    most_common_ngrams_dfs.append(pd.DataFrame(most_common_ngrams, columns=[f"{n}gram_all", f"count_{n}gram"]))

result_df = pd.concat(most_common_ngrams_dfs, axis=1)
result_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_ngram['ngrams'] = for_ngram['Sequence'].apply(lambda x: get_ngrams(x,n))


Unnamed: 0,3gram_top,count_3gram,3gram_all,count_3gram.1,4gram_top,count_4gram,4gram_all,count_4gram.1,5gram_top,count_5gram,5gram_all,count_5gram.1,6gram_top,count_6gram,6gram_all,count_6gram.1
0,RRR,32.0,RRR,275.0,RRRR,25.0,RRRR,201.0,RRRRR,21.0,RRRRR,163.0,RRRRRR,17.0,RRRRRR,132
1,HHH,10.0,HHH,93.0,HHHH,9.0,HHHH,80.0,HHHHH,8.0,HHHHH,68.0,HHHHHH,7.0,HHHHHH,57
2,KKR,6.0,KKR,45.0,LLLL,4.0,RKKR,32.0,LLLLL,3.0,KRRQR,22.0,KWRRKL,2.0,KKRRQR,20
3,LLL,5.0,KRR,36.0,RRKL,3.0,RRQR,26.0,KWRRK,2.0,KKRRQ,21.0,WRRKLK,2.0,KRRQRR,19
4,ALK,5.0,RKK,35.0,KLKK,3.0,KKRR,24.0,WRRKL,2.0,RRQRR,21.0,RRKLKK,2.0,RKKRRQ,19
5,RRK,4.0,RRQ,31.0,PKKK,3.0,RQRR,24.0,RRKLK,2.0,RKKRR,21.0,RKLKKL,2.0,RRQRRR,17
6,LKK,4.0,RQR,30.0,KKRR,3.0,KRRQ,23.0,RKLKK,2.0,RQRRR,20.0,KLKKLR,2.0,YSPTTR,16
7,KKK,4.0,QRR,28.0,KWRR,2.0,QRRR,22.0,KLKKL,2.0,TRRYG,19.0,PKKKRK,2.0,SPTTRR,16
8,KRK,4.0,RHR,26.0,WRRK,2.0,TRRY,20.0,LKKLR,2.0,GRKKR,18.0,KKKRKV,2.0,PTTRRY,16
9,GRK,4.0,GRK,25.0,RKLK,2.0,RRYG,20.0,PKKKR,2.0,YSPTT,17.0,GRKKRR,2.0,TTRRYG,16


In [111]:
fluorescent_microscopy_df = df[(df['Method'] == 'Fluorescent Microscopy') & (df['Uptake'] != 0)]

fluorescent_microscopy_df = fluorescent_microscopy_df.groupby(['Sequence']).agg({'Uptake': ['min', 'max']})['Uptake']
fluorescent_microscopy_df = fluorescent_microscopy_df[(fluorescent_microscopy_df['min'] == fluorescent_microscopy_df['max'])]
fluorescent_microscopy_df = fluorescent_microscopy_df['min'].reset_index()
fluorescent_microscopy_df = fluorescent_microscopy_df.rename(columns={"min": "uptake"})
records = [SeqRecord(Seq(row["Sequence"]), id=str(index)) for index, row in fluorescent_microscopy_df.iterrows()]

SeqIO.write(records, "fluorescent_microscopy_sequences.fasta", "fasta")
# https://www.ebi.ac.uk/jdispatcher/msa/clustalo/summary?jobId=clustalo-I20240722-090845-0839-15371912-p1m&js=pass

51

In [112]:
fluorescence_spectroscopy_df = df[(df['Method'] == 'Fluorescence spectroscopy') & (df['Uptake'] != 0)]

fluorescence_spectroscopy_df = fluorescence_spectroscopy_df.groupby(['Sequence']).agg({'Uptake': ['min', 'max']})['Uptake']
fluorescence_spectroscopy_df = fluorescence_spectroscopy_df[(fluorescence_spectroscopy_df['min'] == fluorescence_spectroscopy_df['max'])]
fluorescence_spectroscopy_df = fluorescence_spectroscopy_df['min'].reset_index()
fluorescence_spectroscopy_df = fluorescence_spectroscopy_df.rename(columns={"min": "uptake"})

records = [SeqRecord(Seq(row["Sequence"]), id=str(index)) for index, row in fluorescence_spectroscopy_df.iterrows()]

SeqIO.write(records, "fluorescence_spectroscopy_sequences.fasta", "fasta")
# https://www.ebi.ac.uk/jdispatcher/msa/clustalo/summary?jobId=clustalo-I20240722-091155-0128-55068385-p1m&js=pass

82

In [113]:
clsm = df[(df['Method'] == 'CLSM') & (df['Uptake'] != 0)]

clsm = clsm.groupby(['Sequence']).agg({'Uptake': ['min', 'max']})['Uptake']
clsm = clsm[(clsm['min'] == clsm['max'])]
clsm = clsm['min'].reset_index()
clsm = clsm.rename(columns={"min": "uptake"})

records = [SeqRecord(Seq(row["Sequence"]), id=str(index)) for index, row in clsm.iterrows()]

SeqIO.write(records, "clsm_sequences.fasta", "fasta")
# https://www.ebi.ac.uk/jdispatcher/msa/clustalo/summary?jobId=clustalo-I20240722-092815-0217-62452346-p1m&js=pass

18

In [186]:
flow_df[flow_df['Sequence'].str.contains('rhrhrh', case = False)]

Unnamed: 0,Sequence,uptake,ngrams
83,HRHRHRHR,10.0,"{RHRHRH, HRHRHR}"
152,RHRHRHRHR,50.0,"{HRHRHR, RHRHRH}"
191,RRHRHRHRHR,100.0,"{HRHRHR, RRHRHR, RHRHRH}"
192,RRRHRHRHRHR,250.0,"{HRHRHR, RRHRHR, RHRHRH, RRRHRH}"
195,RRRRHRHRHRHR,400.0,"{RHRHRH, RRRRHR, HRHRHR, RRHRHR, RRRHRH}"
197,RRRRRHRHRHRHR,500.0,"{RHRHRH, RRRRHR, HRHRHR, RRRRRH, RRHRHR, RRRHRH}"


In [187]:
flow_df[flow_df['Sequence'].str.contains('spttrr', case = False)]

Unnamed: 0,Sequence,uptake,ngrams
11,ALWMRWYSPTTRRYG,220.0,"{YSPTTR, WMRWYS, SPTTRR, TTRRYG, LWMRWY, ALWMR..."
100,KLWMRWYSPTTRRYG,790.0,"{YSPTTR, WMRWYS, SPTTRR, TTRRYG, LWMRWY, KLWMR..."
145,RAWMRWYSPTTRRYG,800.0,"{YSPTTR, WMRWYS, SPTTRR, TTRRYG, RWYSPT, WYSPT..."
164,RLAMRWYSPTTRRYG,750.0,"{YSPTTR, RLAMRW, AMRWYS, SPTTRR, TTRRYG, LAMRW..."
165,RLFMRFYSPTTRRYG,830.0,"{MRFYSP, YSPTTR, PTTRRY, FYSPTT, FMRFYS, RFYSP..."
166,RLIMRIYSPTTRRYG,860.0,"{YSPTTR, PTTRRY, IYSPTT, SPTTRR, RIYSPT, IMRIY..."
167,RLLMRLYSPTTRRYG,440.0,"{LMRLYS, YSPTTR, RLLMRL, LLMRLY, RLYSPT, SPTTR..."
168,RLVMRVYSPTTRRYG,590.0,"{VMRVYS, VYSPTT, YSPTTR, SPTTRR, TTRRYG, LVMRV..."
169,RLWARWYSPTTRRYG,750.0,"{RLWARW, ARWYSP, YSPTTR, SPTTRR, TTRRYG, RWYSP..."
171,RLWMAWYSPTTRRYG,90.0,"{YSPTTR, MAWYSP, RLWMAW, WMAWYS, LWMAWY, SPTTR..."
