In [53]:
import pandas as pd


In [54]:
#Loading annotation and label file
data = pd.read_csv(r'.\AF50m_subset_REGEX_man_labels_5k.txt', sep="\t")

In [55]:
data.head()

Unnamed: 0,protein_annotation,regex_label,manual_label,note
0,NADPH-dependent 7-cyano-7-deazaguanine reducta...,proper,proper,
1,Hydrogen peroxide-inducible genes activator > ...,proper,proper,
2,Scoulerine-9-O-methyltransferase 1,proper,proper,
3,PadR domain-containing protein,proper,proper,
4,protein mono-ADP-ribosyltransferase PARP9 isof...,proper,proper,


In [56]:
# How many annotations have been labeled manually?
print(data["manual_label"].notna().sum())

5000


In [57]:
# Getting the annotatons that have been labeled manually
annotation = data.loc[data["manual_label"].notna(), ["protein_annotation"]]

# checking that the right amount of annotations were selected
len(annotation.index)

5000

In [58]:
#inspecting the annotations
annotation.head()

# things are rough my man
# so annotations contain numbers, dashes, colons...
# uppercase terms to but i do think that's less important. 

Unnamed: 0,protein_annotation
0,NADPH-dependent 7-cyano-7-deazaguanine reducta...
1,Hydrogen peroxide-inducible genes activator > ...
2,Scoulerine-9-O-methyltransferase 1
3,PadR domain-containing protein
4,protein mono-ADP-ribosyltransferase PARP9 isof...


In [None]:
#preprocessing goals
# normalize- lowercase and remove extra whitespace
# oo should also remove commas
# however, want to keep things like GO:0001234 together
# consider stopwords

# for colons- if in a word keep if end of word remove....

In [59]:
import re

# hi thoughts
# don't want to make _ blanspaces to maintain code identity
# do want to make / blankspaces

def clean_text(text):
    text = text.lower()  # Lowercase
    text = text.replace("/", " ")  # Replace slashes with spaces
    text = re.sub(r"\s+", " ", text)  # remove multiple spaces
    text = re.sub(r"[^a-z0-9:\-\. ]", "", text)  # keep alphanum, colon, dash, period
    return text.strip()



In [60]:
#generating cleaned annotations
cleaned_annotations = annotation["protein_annotation"].apply(clean_text)

In [61]:
#for tokenisation, want to keep things like GO terms with : together

token_pattern = r"[A-Za-z0-9:\-\.']+"

In [62]:
#trying to figure out what stop words to use
# considering this is a more nische context (biological) words may differ

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# gets built in english stopwords from sklearn
stopwords = list(ENGLISH_STOP_WORDS)

# so my problem is what is a stopword in this context?
# things that may appear frequently like "uncharacterized" is a signal, so I don't want to remove

#lets try without any stopwords first


In [63]:
# stopwords 2.0

# want to see the most common words in the annotations
from collections import Counter

def tokenize(text):
    return re.findall(token_pattern, text.lower())

all_tokens = [tok for text in cleaned_annotations for tok in tokenize(text)]
token_counts = Counter(all_tokens)
print(token_counts.most_common(30))

[('protein', 2533), ('putative', 601), ('family', 537), ('domain-containing', 439), ('domain', 268), ('phage', 261), ('dna', 257), ('subunit', 242), ('uncharacterized', 174), ('isoform', 165), ('tail', 131), ('regulator', 128), ('1', 119), ('factor', 117), ('2', 107), ('prophage', 105), ('transcriptional', 103), ('reductase', 101), ('helicase', 99), ('and', 87), ('type', 85), ('a', 81), ('hydrolase', 76), ('polymerase', 75), ('synthase', 74), ('of', 71), ('atpase', 67), ('kelch', 66), ('rna', 66), ('cell', 60)]


In [None]:
# so from this we find that protein is super common, buuuuut also putative
# want to keep putative, might try not removing any stopwords for now

In [64]:
#so let's try TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    token_pattern=token_pattern, # already have it
    lowercase=True,
    stop_words=stopwords, # only english atm
    ngram_range=(1,2), # might want to increase
    min_df=2, # words should appear in at least 2 annotations
    max_df=0.8 # wods that appear in more than 80% of annotations are ignored
)



In [65]:
tfidf = vectorizer.fit(cleaned_annotations)
len(tfidf.vocabulary_)
list(tfidf.vocabulary_.keys())[:100]  # Display the first 100 feature names


['nadph-dependent',
 '7-cyano-7-deazaguanine',
 'reductase',
 '7-cyano-7-deazaguanine reductase',
 'hydrogen',
 'peroxide-inducible',
 'genes',
 'activator',
 'oxyr',
 'hydrogen peroxide-inducible',
 'peroxide-inducible genes',
 'genes activator',
 'activator oxyr',
 '1',
 'padr',
 'domain-containing',
 'protein',
 'domain-containing protein',
 'isoform',
 'x4',
 'isoform x4',
 'chromosome',
 'replication',
 'chromosome replication',
 'g',
 'u',
 'mismatch-specific',
 'dna',
 'glycosylase',
 'modular',
 'u mismatch-specific',
 'dna glycosylase',
 'modular protein',
 'p22',
 'coat',
 '-',
 '5',
 'family',
 'p22 coat',
 'protein 5',
 'family protein',
 'unplaced',
 'genomic',
 'scaffold',
 'genome',
 'shotgun',
 'sequence',
 'unplaced genomic',
 'genomic scaffold',
 'genome shotgun',
 'shotgun sequence',
 'kaic',
 'putative',
 'circadian',
 'clock',
 'circadian clock',
 'clock protein',
 'uncharacterized',
 'x5',
 'uncharacterized protein',
 'isoform x5',
 'rnase',
 'transport',
 'transp

In [66]:
# inspecting some of the TF-IDF values
feature_names = vectorizer.get_feature_names_out()
print(feature_names[:100])  # Display the first 100 feature names

['-' '0' '0 sporulation' '1' '1 isoform' '1 like' '1 member' '1 protein'
 '1 rna' '1 subunit' '1-like' '1-like isoform' '1-like protein'
 '1-phosphate' '1-phosphate phosphatase' '10' '11' '12' '12-dioxygenase'
 '13' '14' '15' '16' '17' '19' '1b' '2' '2 chloroplastic' '2 conserved'
 '2 domain' '2 isoform' '2 mitochondrial' '2 protein' '2-1'
 '2-1 chloroplastic' '2-dehydrogenase' '2-deoxyribosyltransferase'
 '2-like' '2-like 1' '2-like isoform' '2-oxoglutarate'
 '2-oxoglutarate 5-dioxygenase' '2-oxoglutarate iron-dependent' '20'
 '20 chloroplastic-like' '20s' '20s proteasome' '23' '23s' '23s rrna' '25'
 '26' '26s' '26s protease' '26s proteasome' '29' '29 isoform' '2a'
 '2og-feii' '2og-feii oxygenase' '3' '3 alpha' '3 isoform' '3-5'
 '3-5 exonuclease' '3-5 rna' '3-dehydrogenase' '3-like' '3-like protein'
 '3-oxoacyl-acyl-carrier-protein'
 '3-oxoacyl-acyl-carrier-protein reductase' '3-phosphatase'
 '3-pyrophosphohydrolase' '30' '31' '32' '33' '35' '3a-like' '4'
 '4 chloroplastic' '4-dehydr