In [1]:
import pandas as pd
import re

In [2]:
pc_genes = pd.read_csv('../../data/pc_genes/uniprot-compressed_true_download_true_fields_accession_2Creviewed_2C-2023.01.13-21.17.12.15.tsv.gz',
                        compression = 'gzip', on_bad_lines='skip', sep = '\t')

# Filtering human genes

In [3]:
pc_genes_human = pc_genes[pc_genes.loc[:,'Organism'] == 'Homo sapiens (Human)']

# Filtering for Protein Existance > 3

In [4]:
pc_genes_human.loc[:,'Protein existence'].unique()

array(['Uncertain', 'Evidence at protein level',
       'Evidence at transcript level', 'Inferred from homology',
       'Predicted'], dtype=object)

In [5]:
existence = pc_genes_human.loc[:,'Protein existence'].unique()
high_conf = existence[[1,2]]
high_conf

array(['Evidence at protein level', 'Evidence at transcript level'],
      dtype=object)

In [6]:
pc_genes_human.loc[:,'Protein existence'] == existence[0] 

31         True
58        False
59         True
75        False
147       False
          ...  
568338     True
568339     True
568340    False
568742     True
568743    False
Name: Protein existence, Length: 20404, dtype: bool

# Number of PCs with high confidence and low confidence

In [7]:
sum(pc_genes_human.loc[:,'Protein existence'].isin(high_conf))

18894

In [8]:
sum(~pc_genes_human.loc[:,'Protein existence'].isin(high_conf))

1510

In [9]:
high_conf_pc = pc_genes_human[pc_genes_human.loc[:,'Protein existence'].isin(high_conf)]

In [10]:
high_conf_pc

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence
58,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level
75,A0A0C5B5G6,reviewed,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,Homo sapiens (Human),16,Evidence at protein level
147,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level
189,A0A0U1RRE5,reviewed,NBDY_HUMAN,Negative regulator of P-body association (P-bo...,NBDY LINC01420,Homo sapiens (Human),68,Evidence at protein level
212,A0A1B0GTW7,reviewed,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,Homo sapiens (Human),788,Evidence at protein level
...,...,...,...,...,...,...,...,...
567017,Q9H693,reviewed,CP095_HUMAN,Uncharacterized protein C16orf95,C16orf95,Homo sapiens (Human),158,Evidence at transcript level
567018,Q9H7T3,reviewed,CJ095_HUMAN,Uncharacterized protein C10orf95,C10orf95,Homo sapiens (Human),257,Evidence at protein level
567023,Q9HBI5,reviewed,CC014_HUMAN,Uncharacterized protein C3orf14,C3orf14 HT021,Homo sapiens (Human),128,Evidence at transcript level
567619,Q9NZ38,reviewed,IDAS1_HUMAN,Uncharacterized protein IDI2-AS1 (IDI2 antisen...,IDI2-AS1 C10orf110 HT009,Homo sapiens (Human),188,Evidence at transcript level


### Reset index of high_conf_pc

In [11]:
high_conf_pc

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence
58,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level
75,A0A0C5B5G6,reviewed,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,Homo sapiens (Human),16,Evidence at protein level
147,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level
189,A0A0U1RRE5,reviewed,NBDY_HUMAN,Negative regulator of P-body association (P-bo...,NBDY LINC01420,Homo sapiens (Human),68,Evidence at protein level
212,A0A1B0GTW7,reviewed,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,Homo sapiens (Human),788,Evidence at protein level
...,...,...,...,...,...,...,...,...
567017,Q9H693,reviewed,CP095_HUMAN,Uncharacterized protein C16orf95,C16orf95,Homo sapiens (Human),158,Evidence at transcript level
567018,Q9H7T3,reviewed,CJ095_HUMAN,Uncharacterized protein C10orf95,C10orf95,Homo sapiens (Human),257,Evidence at protein level
567023,Q9HBI5,reviewed,CC014_HUMAN,Uncharacterized protein C3orf14,C3orf14 HT021,Homo sapiens (Human),128,Evidence at transcript level
567619,Q9NZ38,reviewed,IDAS1_HUMAN,Uncharacterized protein IDI2-AS1 (IDI2 antisen...,IDI2-AS1 C10orf110 HT009,Homo sapiens (Human),188,Evidence at transcript level


In [12]:
#Resets index after filtering
high_conf_pc = high_conf_pc.reset_index().iloc[:,1:]

# Check to see the distribution of GO terms for each Protein existence level

In [13]:
GO = pd.read_csv('../../data/GO/pro_GO.csv', sep = ",")

In [14]:
GO

Unnamed: 0,DB_Object_Symbol,GO ID,Aspect,DB Object Name
0,IGKV3-7,GO:0002250,P,Probable non-functional immunoglobulin kappa v...
1,IGKV1D-42,GO:0002250,P,Probable non-functional immunoglobulin kappa v...
2,IGLV4-69,GO:0002250,P,Immunoglobulin lambda variable 4-69
3,IGLV8-61,GO:0002250,P,Immunoglobulin lambda variable 8-61
4,IGLV4-60,GO:0002250,P,Immunoglobulin lambda variable 4-60
...,...,...,...,...
160861,IKBKG,GO:0043123,P,NF-kappa-B essential modulator
160862,EFCAB9,GO:0048240,P,EF-hand calcium-binding domain-containing prot...
160863,KNSTRN,GO:0051988,P,Small kinetochore-associated protein
160864,PTPN23,GO:0045022,P,Tyrosine-protein phosphatase non-receptor type 23


### How many of the first uniprot symbols are the ones that GO uses

In [15]:
GO_symbols = GO.loc[:,'DB_Object_Symbol'].unique()

In [16]:
high_conf_pc

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence
0,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level
1,A0A0C5B5G6,reviewed,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,Homo sapiens (Human),16,Evidence at protein level
2,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level
3,A0A0U1RRE5,reviewed,NBDY_HUMAN,Negative regulator of P-body association (P-bo...,NBDY LINC01420,Homo sapiens (Human),68,Evidence at protein level
4,A0A1B0GTW7,reviewed,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,Homo sapiens (Human),788,Evidence at protein level
...,...,...,...,...,...,...,...,...
18889,Q9H693,reviewed,CP095_HUMAN,Uncharacterized protein C16orf95,C16orf95,Homo sapiens (Human),158,Evidence at transcript level
18890,Q9H7T3,reviewed,CJ095_HUMAN,Uncharacterized protein C10orf95,C10orf95,Homo sapiens (Human),257,Evidence at protein level
18891,Q9HBI5,reviewed,CC014_HUMAN,Uncharacterized protein C3orf14,C3orf14 HT021,Homo sapiens (Human),128,Evidence at transcript level
18892,Q9NZ38,reviewed,IDAS1_HUMAN,Uncharacterized protein IDI2-AS1 (IDI2 antisen...,IDI2-AS1 C10orf110 HT009,Homo sapiens (Human),188,Evidence at transcript level


### Remove rows with NAN as gene name

In [26]:
gene_names = high_conf_pc.loc[:,'Gene Names']
gene_names

0                         PIGBOS1
1                         MT-RNR1
2                          CD300H
3                  NBDY LINC01420
4                     CIROP LMLN2
                   ...           
18829                    TMEM191A
18830                    C16orf95
18831                    C10orf95
18832               C3orf14 HT021
18833    IDI2-AS1 C10orf110 HT009
Name: Gene Names, Length: 18834, dtype: object

In [27]:
len(gene_names)

18834

In [28]:
high_conf_pc = high_conf_pc.iloc[~gene_names.isna().values,:].reset_index(drop = True)
# Removed 60 Genes

In [29]:
gene_names = high_conf_pc.loc[:,'Gene Names']


### Selet first gene in each col

In [30]:
def get_first_gene_name(gene_names:pd.Series):
    """
    returns an array containing the first gene name from the gene_name column
    """
    mylist = []
    for element in gene_names:
        print(element)
        mylist.append(re.split(' ', element)[0])

    return mylist
    
#processed = [re.split(' ', element)[0] for element in gene_names]


In [31]:
genes = get_first_gene_name(gene_names)

PIGBOS1
MT-RNR1
CD300H
NBDY LINC01420
CIROP LMLN2
TRBC2 TCRBC2
SLC12A8 CCC9
RBM47
TTC26 IFT56
TMEM129
E2F8
UBA6 MOP4 UBE1L2
ESYT2 FAM62B KIAA1228
ESYT3 FAM62C
MED19 LCMR1
BLTP3B KIAA0701 SHIP164 UHRF1BP1L
POTEB3
IGLC7
SHTN1 KIAA1598
SLC5A10 SGLT5
FEZF1 FEZ ZNF312B
TMEM120B
CLRN2
ARHGAP10 GRAF2
IRGM IFI1 IRGM1 LRG47
FAM170A ZNFD
ANO9 PIG5 TMEM16J TP53I5
SLC22A23 C6orf85
ODAM APIN
FAM168B KIAA0280L MANI
PXDNL VPO2
ILVBL AHAS HACL2
UBE2QL1
SYCE3 C22orf41 THEG2
PLEKHG3 KIAA0599
ELOVL7
SSC5D
FSD2 SPRYD1
SH3PXD2B FAD49 KIAA1295 TKS4
CIBAR1 FAM92A FAM92A1
MACROD2 C20orf133
ZC3H12D C6orf95 MCPIP4 TFL
FRMD3 EPB41L4O
CCDC78 C16orf25 JFP10
HFM1 SEC3D1
NBAS NAG
TARS3 TARSL2
TMEM218
TESPA1 KIAA0748 HSPC257
HYKK AGPHD1
MEIOC C17orf104
CCDC66
TYW5 C2orf60
MYBPHL
TMEM131L KIAA0922
SSPOP KIAA2036 SSPO
VWA8 KIAA0564
LAMB4
CRPPA ISPD
FAM221A C7orf46
GSAP PION
GTPBP10 OBGH2 UG0751c10
WDR91 HSPC049
PRSS37 TRYX2
CDC14C CDC14B2 CDC14Bretro
MBLAC1
EME2
XIRP2 CMYA3
CLEC18A MRLP2
VPS37C PML39
ODAD3 CCDC151
CFAP

In [32]:
genes

['PIGBOS1',
 'MT-RNR1',
 'CD300H',
 'NBDY',
 'CIROP',
 'TRBC2',
 'SLC12A8',
 'RBM47',
 'TTC26',
 'TMEM129',
 'E2F8',
 'UBA6',
 'ESYT2',
 'ESYT3',
 'MED19',
 'BLTP3B',
 'POTEB3',
 'IGLC7',
 'SHTN1',
 'SLC5A10',
 'FEZF1',
 'TMEM120B',
 'CLRN2',
 'ARHGAP10',
 'IRGM',
 'FAM170A',
 'ANO9',
 'SLC22A23',
 'ODAM',
 'FAM168B',
 'PXDNL',
 'ILVBL',
 'UBE2QL1',
 'SYCE3',
 'PLEKHG3',
 'ELOVL7',
 'SSC5D',
 'FSD2',
 'SH3PXD2B',
 'CIBAR1',
 'MACROD2',
 'ZC3H12D',
 'FRMD3',
 'CCDC78',
 'HFM1',
 'NBAS',
 'TARS3',
 'TMEM218',
 'TESPA1',
 'HYKK',
 'MEIOC',
 'CCDC66',
 'TYW5',
 'MYBPHL',
 'TMEM131L',
 'SSPOP',
 'VWA8',
 'LAMB4',
 'CRPPA',
 'FAM221A',
 'GSAP',
 'GTPBP10',
 'WDR91',
 'PRSS37',
 'CDC14C',
 'MBLAC1',
 'EME2',
 'XIRP2',
 'CLEC18A',
 'VPS37C',
 'ODAD3',
 'CFAP69',
 'PSMB11',
 'PSD',
 'PEDS1',
 'HTR3E',
 'CNOT1',
 'CARNS1',
 'MEGF11',
 'BDP1',
 'FAM221B',
 'CCDC88B',
 'NKX2-6',
 'IZUMO1R',
 'FAM83G',
 'PALM3',
 'RBMY1B',
 'PGP',
 'TMEM8B',
 'RCCD1',
 'GOLGA6L9',
 'TRABD2B',
 'TCAF2',
 'MFSD2B',
 

In [34]:
len(genes) == len(gene_names)
# Function doesnt have any loss

True

### Append column onto df


In [35]:
high_conf_pc['FirstUniprot'] = genes

In [36]:
high_conf_pc

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Protein existence,FirstUniprot
0,A0A0B4J2F0,reviewed,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,Homo sapiens (Human),54,Evidence at protein level,PIGBOS1
1,A0A0C5B5G6,reviewed,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,Homo sapiens (Human),16,Evidence at protein level,MT-RNR1
2,A0A0K2S4Q6,reviewed,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,Homo sapiens (Human),201,Evidence at protein level,CD300H
3,A0A0U1RRE5,reviewed,NBDY_HUMAN,Negative regulator of P-body association (P-bo...,NBDY LINC01420,Homo sapiens (Human),68,Evidence at protein level,NBDY
4,A0A1B0GTW7,reviewed,CIROP_HUMAN,Ciliated left-right organizer metallopeptidase...,CIROP LMLN2,Homo sapiens (Human),788,Evidence at protein level,CIROP
...,...,...,...,...,...,...,...,...,...
18829,Q9H0A3,reviewed,T191A_HUMAN,Transmembrane protein 191A,TMEM191A,Homo sapiens (Human),160,Evidence at transcript level,TMEM191A
18830,Q9H693,reviewed,CP095_HUMAN,Uncharacterized protein C16orf95,C16orf95,Homo sapiens (Human),158,Evidence at transcript level,C16orf95
18831,Q9H7T3,reviewed,CJ095_HUMAN,Uncharacterized protein C10orf95,C10orf95,Homo sapiens (Human),257,Evidence at protein level,C10orf95
18832,Q9HBI5,reviewed,CC014_HUMAN,Uncharacterized protein C3orf14,C3orf14 HT021,Homo sapiens (Human),128,Evidence at transcript level,C3orf14


# filter out non required cols

In [39]:
high_conf_pc.loc[:,("FirstUniprot","Protein existence", "Protein names")]

Unnamed: 0,FirstUniprot,Protein existence,Protein names
0,PIGBOS1,Evidence at protein level,Protein PIGBOS1 (PIGB opposite strand protein 1)
1,MT-RNR1,Evidence at protein level,Mitochondrial-derived peptide MOTS-c (Mitochon...
2,CD300H,Evidence at protein level,Protein CD300H (CD300 antigen-like family memb...
3,NBDY,Evidence at protein level,Negative regulator of P-body association (P-bo...
4,CIROP,Evidence at protein level,Ciliated left-right organizer metallopeptidase...
...,...,...,...
18829,TMEM191A,Evidence at transcript level,Transmembrane protein 191A
18830,C16orf95,Evidence at transcript level,Uncharacterized protein C16orf95
18831,C10orf95,Evidence at protein level,Uncharacterized protein C10orf95
18832,C3orf14,Evidence at transcript level,Uncharacterized protein C3orf14


# Save

In [40]:
high_conf_pc.loc[:,("FirstUniprot","Protein existence", "Protein names")].to_csv('../../data/pc_genes/processed_uniprot.csv', sep = ",")