In [47]:
!pip install webdriver_manager

Collecting webdriver_manager
  Using cached webdriver_manager-3.5.2-py2.py3-none-any.whl (17 kB)
Collecting configparser
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting crayons
  Using cached crayons-0.4.0-py2.py3-none-any.whl (4.6 kB)
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama, crayons, configparser, webdriver-manager
Successfully installed colorama-0.4.4 configparser-5.2.0 crayons-0.4.0 webdriver-manager-3.5.2


In [48]:
import pandas as pd
import requests
import re
import nltk
import warnings
import csv

import sys
sys.path.append('/mnt/c/Users/cleon/Documents/CAL_CAL/ironhack/Final_project/Redefining_Cancer_treatment/src')
import datafunctions as dataf

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

from nltk.corpus import stopwords

warnings.filterwarnings("ignore");
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cleonortiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
trainvariants = pd.read_csv('../data/training_variants')
testvariants = pd.read_csv('../data/test_variants')
trainvariants.sample(5)

Unnamed: 0,ID,Gene,Variation,Class
1065,1065,EWSR1,EWSR1-ETV4 Fusion,2
500,500,TP53,E258V,1
1213,1213,PIK3CA,H1047Y,7
1488,1488,FGFR2,C382R,7
2845,2845,BRCA2,A1170V,6


- *training_variants* and *test_variants* are a comma separated file containing the description of the genetic mutations used for training.
- Fields are:
    - ID: the id of the row used to link the mutation to the clinical evidence
    - Gene: the gene where this genetic mutation is located
    - Variation: the aminoacid change for this mutations
    -Class: 1-9 the class this genetic mutation has been classified on. Classes are as follow:
    
        1. Likely loss-of-function
        2. Likely Gain-of-function
        3. Neutral
        4. Loss-of-function
        5. Likely Neutral
        6. Inconclusive
        7. Gain-of-function
        8. Likely Switch-of-function
        9. Switch-of-function

In [4]:
trainvariants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3321 non-null   int64 
 1   Gene       3321 non-null   object
 2   Variation  3321 non-null   object
 3   Class      3321 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 103.9+ KB


In [5]:
traintext = pd.read_csv('../data/training_text',sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)
testtext = pd.read_csv('../data/test_text',sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)

In [6]:
traintext.head()

Unnamed: 0,ID,TEXT
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
traintext.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3321 non-null   int64 
 1   TEXT    3316 non-null   object
dtypes: int64(1), object(1)
memory usage: 52.0+ KB


In [8]:
traintext.head()

Unnamed: 0,ID,TEXT
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [9]:
trainvariants.Gene.unique()

array(['FAM58A', 'CBL', 'SHOC2', 'TERT', 'DICER1', 'PTPRT', 'RHEB',
       'SHQ1', 'CCND2', 'RAD50', 'CCND3', 'RIT1', 'CCNE1', 'RYBP',
       'TGFBR1', 'TGFBR2', 'MSH6', 'KMT2D', 'LATS1', 'PBRM1', 'SF3B1',
       'LATS2', 'EGFR', 'NKX2-1', 'EIF1AX', 'ARID2', 'BRD4', 'HIST1H1C',
       'ERRFI1', 'CHEK2', 'PAK1', 'TMPRSS2', 'H3F3A', 'ELF3', 'ROS1',
       'ASXL2', 'CDH1', 'EPCAM', 'EP300', 'EPAS1', 'TP53', 'TP53BP1',
       'SMAD2', 'SMAD3', 'SMAD4', 'CDK4', 'AURKB', 'CDK6', 'FBXW7',
       'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'ASXL1',
       'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'BRIP1', 'ERCC3', 'ERCC4',
       'ABL1', 'CEBPA', 'ERG', 'HLA-A', 'HLA-B', 'PDGFRA', 'PDGFRB',
       'RBM10', 'KDM5C', 'ESR1', 'ETV1', 'ETV6', 'TSC1', 'IKZF1', 'TSC2',
       'EWSR1', 'ACVR1', 'FOXA1', 'MDM2', 'ARID1A', 'EZH2', 'MDM4',
       'KLF4', 'RXRA', 'MAP3K1', 'AXIN1', 'BAP1', 'MEN1', 'FANCA',
       'FANCC', 'MET', 'U2AF1', 'KMT2C', 'FAT1', 'ARID1B', 'PIK3CA',
       'PIK3CB', 'PIM1

In [10]:
uvariants = trainvariants.Variation.unique().size
nvariants = trainvariants.Variation.size

print(f'Number of different variants: {uvariants}\nNumber of columns: {nvariants}')

Number of different variants: 2996
Number of columns: 3321


In [25]:
Description = []
for gen in trainvariants.Gene[:3]:
    url = 'https://www.genenetwork.nl/api/v1/gene/' + gen
    dic = requests.get(url).json()
    Description.append(dic['gene']['description'])

In [26]:
Description

['family with sequence similarity 58, member A [Source:HGNC Symbol;Acc:HGNC:28434]',
 'Cbl proto-oncogene, E3 ubiquitin protein ligase [Source:HGNC Symbol;Acc:HGNC:1541]',
 'Cbl proto-oncogene, E3 ubiquitin protein ligase [Source:HGNC Symbol;Acc:HGNC:1541]']

trainvariants['Description'] = Description

In [11]:
trainvariants = dataf.typeeffect(trainvariants)
testvariants = dataf.typeeffect(testvariants)

In [12]:
trainvariants.sample(5)

Unnamed: 0,ID,Gene,Variation,Class,Type,Effect
864,864,ERG,TMPRSS2-ERG Fusion,7,Deletion,fusion
3103,3103,RAC1,G12V,7,Substitution,missense
3241,3241,DDR2,T654M,6,Substitution,missense
2917,2917,NFE2L2,L30F,7,Substitution,missense
3210,3210,RB1,S567L,1,Substitution,missense


In [13]:
testvariants.sample(5)

Unnamed: 0,ID,Gene,Variation,Type,Effect
5313,5313,PHOX2B,G197D,Substitution,missense
459,459,SERPING1,V454E,Substitution,missense
606,606,PEX13,I326T,Substitution,missense
5590,5590,SDHD,H145N,Substitution,missense
2901,2901,PIKFYVE,K1103R,Substitution,missense


In [14]:
trainvariants = trainvariants[['ID', 'Gene', 'Variation', 'Type', 'Effect', 'Class']]
testvariants = testvariants[['ID', 'Gene', 'Variation', 'Type', 'Effect']]

trainvariants.set_index('ID',inplace=True)
testvariants.set_index('ID',inplace=True)
trainvariants.head()

In [15]:
trainvariants.to_csv('../trainvariantswithtypeandeffect.csv')
testvariants.to_csv('../testvariantswithtypeandeffect.csv')

In [22]:
result = pd.merge(trainvariants, traintext, on='ID', how='left')
result.head()

Unnamed: 0,ID,Gene,Variation,Type,Effect,Class,TEXT
0,0,FAM58A,Truncating Mutations,unknown,nonsense,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Substitution,nonsense,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Substitution,missense,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Substitution,missense,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Substitution,missense,4,Oncogenic mutations in the monomeric Casitas B...


In [23]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3321 non-null   int64 
 1   Gene       3321 non-null   object
 2   Variation  3321 non-null   object
 3   Type       3321 non-null   object
 4   Effect     3321 non-null   object
 5   Class      3321 non-null   int64 
 6   TEXT       3316 non-null   object
dtypes: int64(2), object(5)
memory usage: 207.6+ KB


In [24]:
result.loc[result['TEXT'].isnull(),'TEXT'] = result['Gene'] + ' ' + result['Variation']

In [20]:
result.head()

Unnamed: 0,ID,Gene,Variation,Type,Effect,Class,TEXT
0,0,FAM58A,Truncating Mutations,unknown,nonsense,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Substitution,nonsense,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Substitution,missense,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Substitution,missense,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Substitution,missense,4,Oncogenic mutations in the monomeric Casitas B...


In [38]:
oncogene = pd.read_csv('../data/ongene_human2.csv',sep="\t")

In [40]:
def cancerous(train, oncogene):
    
    train['Cancerous'] = 'Unknown'
    train['GeneType'] = 'Unknown'
    train['Chromosome'] = 0
    cuenta = 0
    for itrain,rtrain in train.iterrows():
        gene = rtrain['Gene']
        
        for ionco,ronco in oncogene.iterrows():
            if ronco['OncogeneName'] == gene:
                train['Cancerous'][itrain] = 'Oncogene'
                train['GeneType'][itrain] = ronco['GeneType']
                train['Chromosome'][itrain] = ronco['Cytoband'].split('q')[0]
                cuenta +=1
    print(f'{cuenta} genes had been clasified')
    
    return train

In [41]:
oncogene.head()

Unnamed: 0,OncogeneID,OncogeneName,Alias,Links,Cytoband,FullName,GeneType
0,10221,TRIB1,C8FW|GIG-2|GIG2|SKIP1|TRB-1|TRB1,MIM:609461|HGNC:HGNC:16891|Ensembl:ENSG0000017...,8q24.13,tribbles pseudokinase 1,protein-coding
1,7295,TXN,TRDX|TRX|TRX1,MIM:187700|HGNC:HGNC:12435|Ensembl:ENSG0000013...,9q31,thioredoxin,protein-coding
2,867,CBL,C-CBL|CBL2|FRA11B|NSLL|RNF55,MIM:165360|HGNC:HGNC:1541|Ensembl:ENSG00000110...,11q23.3,"Cbl proto-oncogene, E3 ubiquitin protein ligase",protein-coding
3,673,BRAF,B-RAF1|BRAF1|NS7|RAFB1,MIM:164757|HGNC:HGNC:1097|Ensembl:ENSG00000157...,7q34,"B-Raf proto-oncogene, serine/threonine kinase",protein-coding
4,5979,RET,CDHF12|CDHR16|HSCR1|MEN2A|MEN2B|MTC1|PTC|RET-E...,MIM:164761|HGNC:HGNC:9967|Ensembl:ENSG00000165...,10q11.2,ret proto-oncogene,protein-coding


In [42]:
nuevo =cancerous(result,oncogene)

1618 genes had been clasified


In [43]:
nuevo.sample(10)

Unnamed: 0,ID,Gene,Variation,Type,Effect,Class,TEXT,Cancerous,GeneType,Chromosome
1756,1756,IDH1,R132Q,Substitution,missense,8,Current genomic and biochemical analysis revea...,Oncogene,protein-coding,2
2564,2564,BRCA1,T1852S,Substitution,missense,6,Abstract The BRCA1 gene from individuals at ...,Unknown,Unknown,0
1194,1194,PIK3CA,R93W,Substitution,missense,7,"Introduction In 2004, Samuels et al. reported...",Oncogene,protein-coding,3
1857,1857,MTOR,D2512G,Substitution,missense,3,Genes encoding components of the PI3K-Akt-mTOR...,Oncogene,protein-coding,1p36.2
477,477,TP53,G199R,Substitution,missense,1,Inheritance of germ-line mutant alleles of BRC...,Unknown,Unknown,0
1039,1039,TSC2,R462C,Substitution,missense,1,Tuberous sclerosis complex (TSC) is an autosom...,Unknown,Unknown,0
1061,1061,EWSR1,Fusions,Deletion,fusion,2,"Ewing’s sarcoma/PNET, a tumor of the bone and ...",Oncogene,protein-coding,22
1897,1897,MTOR,R2505*,Substitution,nonsense,3,Genes encoding components of the PI3K-Akt-mTOR...,Oncogene,protein-coding,1p36.2
25,25,CBL,H398Y,Substitution,missense,4,Recent evidence has demonstrated that acquired...,Oncogene,protein-coding,11
986,986,ETV6,385_418del,Deletion,unknown,4,Abstract Some familial platelet disorders are...,Oncogene,protein-coding,12p13


In [None]:
# download the driver 
driver = webdriver.Chrome(ChromeDriverManager().install(), options = opciones)

# set the url
url2 = "https://cancer.sanger.ac.uk/census"

# request the url
driver.get(url2)

driver.implicitly_wait(2) # is like a time.sleep but specific of selenium

driver.find_element_by_css_selector("#onetrust-accept-btn-handler").click()
driver.find_element_by_css_selector("#search_query_field").send_keys("rascadores")
driver.implicitly_wait(2)
driver.find_element_by_css_selector("#shop_search_top_submit > span").click()

precio = []
producto = []
descripcion = []