In [1]:
# !python3 -m pip install python-Levenshtein scikit-learn ipython-autotime tokenizers

In [2]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Levenshtein import distance as levenshtein_distance

In [3]:
# Read the CSV (path is relative to the notebook's working directory) into `df`.
NCBI_CSV = "ncbi.csv"
df = pd.read_csv(NCBI_CSV)

In [4]:
df.head()

Unnamed: 0,accession,protein_accession,collection_date,sgene_begin,sgene_end,location,region,sgene_nucleotide,sgene_protein
0,MN908947.3,QHD43416.1,2019-12,21563,25384,China,Asia,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
1,NC_045512.2,YP_009724390.1,2019-12,21563,25384,China,Asia,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
2,MN985325.1,QHO60594.1,2020-01-19,21563,25384,USA,North America,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3,MN975262.1,QHN73810.1,2020-01-11,21563,25384,China,Asia,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
4,MN938384.1,QHN73795.1,2020-01-10,21531,25352,China: Shenzhen,Asia,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...


In [5]:
df.shape

(41678, 9)

In [6]:
df.loc[:10, ['accession', 'region', 'sgene_protein']]

Unnamed: 0,accession,region,sgene_protein
0,MN908947.3,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
1,NC_045512.2,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
2,MN985325.1,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3,MN975262.1,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
4,MN938384.1,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
5,MN988713.1,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
6,MN997409.1,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
7,MN994468.1,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
8,MN994467.1,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
9,MN988669.1,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...


### Drop duplicate pairs (region, sgene_protein)

In [7]:
# Keep one row per distinct (region, sgene_protein) combination.
# Row labels from `df` are retained, so the index of `X` has gaps and later
# cells address its rows by position (`iloc`) rather than by label.
pair_columns = ['region', 'sgene_protein']
X = df.loc[:, pair_columns].drop_duplicates()
X.shape

(4837, 2)

In [8]:
X.head()

Unnamed: 0,region,sgene_protein
0,Asia,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
2,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
16,Oceania,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
19,Europe,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
25,North America,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...


In [9]:
X.groupby('region').nunique()

Unnamed: 0_level_0,sgene_protein
region,Unnamed: 1_level_1
Africa,249
Asia,335
Europe,103
North America,2970
Oceania,1120
South America,60


# Ignore everything below this point

In [None]:
# Hard stop: keeps "Run All" from reaching the cells below this point.
# NOTE(review): calling exit() in a notebook presumably shuts the kernel down,
# so the later cells have to be run by hand — confirm this is intended.
exit()

### Tokenize

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# An untrained byte-pair-encoding tokenizer, plus the trainer (library
# defaults) that is used to fit it in the next cell.
tokenizer = Tokenizer(BPE())
trainer = BpeTrainer()

In [None]:
# Fit the tokenizer on the protein strings of the de-duplicated frame `X`.
tokenizer.train_from_iterator(
    trainer=trainer,
    iterator=X['sgene_protein'],
)

In [None]:
tokenizer.get_vocab_size()

In [None]:
# Encode one example sequence (the row at position 120 of `X`) with the
# trained tokenizer.
sample_protein = X['sgene_protein'].iloc[120]
output = tokenizer.encode(sample_protein)

### Visualise with DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import dbscan
from sklearn.manifold import TSNE


In [None]:
def levenshtein_metric(x, y):
    """Levenshtein distance between two rows of ``X``, addressed by position.

    The samples passed in are not the sequences themselves: each one is a
    positional row number into the module-level frame ``X`` (see ``X10``),
    which lets string data be used with estimators that expect numeric input.

    Parameters
    ----------
    x, y : scalar or one-element array-like
        Positional (``iloc``) row numbers into ``X``.

    Returns
    -------
    The value of ``levenshtein_distance`` for the two ``sgene_protein``
    strings found at those positions.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` holds more than one element.
    IndexError
        If a position lies outside ``X``.
    """
    # ``int()`` applied directly to a one-element ndarray is deprecated in
    # recent NumPy; ``.item()`` unwraps plain scalars, 0-d arrays and
    # one-element arrays alike.
    first = int(np.asarray(x).item())
    second = int(np.asarray(y).item())
    # Select the column once instead of building a whole row Series per lookup.
    proteins = X['sgene_protein']
    return levenshtein_distance(proteins.iloc[first], proteins.iloc[second])

In [None]:
# Number of rows of X (taken from the top) used by the clustering and
# embedding cells below.
DSET_SIZE = 100

# The estimators below need a 2-D numeric array, so every "sample" is a
# single-column row holding a positional index into X; levenshtein_metric
# turns those positions back into protein strings.
# Requires `import numpy as np` in the notebook's import cell.
X10 = np.arange(DSET_SIZE).reshape(-1, 1)

# Plot colour for each region name.
palette = {
    "Europe": "green",
    "Asia": "orange",
    "North America": "blue",
    "South America": "red",
    "Oceania": "cyan",
    "Africa": "brown",
}

In [None]:
%%time

db = DBSCAN(metric=levenshtein_metric, eps=1, min_samples=2, algorithm='brute', n_jobs=1)


db.fit(X10)

labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print(f"Clusters: {n_clusters_}")
print(f"Noise: {n_noise_}")

In [None]:
# Project the first DSET_SIZE sequences to 2-D using the Levenshtein metric.
# init="random" is explicit on purpose: X10 is a single column of row
# positions, not features, so a PCA-based initialisation has nothing
# meaningful to work with (and cannot produce 2 components from 1 column).
# NOTE(review): `square_distances` and `n_iter` are not accepted by every
# scikit-learn release — confirm against the installed version.
tsne = TSNE(
    n_components=2,
    init="random",
    random_state=42,
    metric=levenshtein_metric,
    square_distances=True,
    n_iter=250,
)
pcomp = tsne.fit_transform(X10)


In [None]:
# One colour per row of X, read straight from the column (no per-row Series
# as with iterrows()). Regions missing from `palette` fall back to gray
# instead of yielding None, which is not a usable colour.
colors = [palette.get(region, "gray") for region in X['region']]

fig, ax = plt.subplots(figsize=(20, 20))
# pcomp only covers the first DSET_SIZE rows of X, so slice the colours to match.
ax.scatter(pcomp[:, 0], pcomp[:, 1], color=colors[:DSET_SIZE])
# Empty proxy series so the legend shows the region -> colour mapping.
for region_name, region_color in palette.items():
    ax.scatter([], [], color=region_color, label=region_name)
ax.set(
    title=f"t-SNE of the first {DSET_SIZE} unique (region, sgene_protein) pairs (Levenshtein metric)",
    xlabel="t-SNE component 1",
    ylabel="t-SNE component 2",
)
ax.legend(title="Region");