# Homework №1 - Data collecting and cleansing

## Data preparing

First of all let's import all the libraries we need

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import requests
import time
from typing import List
from tqdm import tqdm
from tqdm import tqdm_notebook

In [None]:
data = pd.read_csv('./data_2.csv', sep=',')

In [None]:
data

Let's check how many gaps we have in the lines, try to fill them

In [None]:
columns = list(data.columns)

In [None]:
columns

The initial dataframe contains the columns listed above.
Let's describe what each column means:

1) **DOI (Digital Object Identifier)** - is a string of numbers, letters and symbols used to uniquely identify an article or document, and to provide it with a permanent web address (URL);

2) **Date** - date  of article publication;
3) **Journal** - the name of the journal in which the article was published;

4) **Title** - the title of the article;

5) **Name** - the systematic name, trivial name or formula of the chemical compound;

6) **measurement_error** -  is the difference between a measured quantity and its true value;

7) **measurement_wavelength** -  important factor in the determination of refractive index using a spectrophotometer or other optical instrument;

8) **measurement_method** - procedures or techniques used to make these assignments and obtain the numerical or symbolic representation of the properties or characteristics being measured;

9) **normalised_name (SMILES)** - standard notation used in chemistry to represent the structure of molecules and chemical reactions using a short, linear string of characters;

10) **raw_value (refractive index)** - the measure of how much a light ray bends when passing from one medium to another, as reported for the experiment;

11) **specifier** - the type of raw_value value.

In [None]:
print("Missing values distribution by column: ")
print(data.isnull().mean())
print("")

Now we can easily look at the distribution of missing values in the given dataset. For example, 88.58% of the values in the 'measurement_wavelength' column are missing, and 41.16% in 'normalised_name', which is unfortunate :( The good news: the columns 'DOI', 'measurement_error', 'measurement_method', 'raw_value' and 'specifier' have no missing values at all! Cool!

In [None]:
print("Column datatypes: ")
print(data.dtypes)

Now let's see at the type of data we have. All the columns have object as their datatype aside from 'measurement_error'. In pandas, object means either string or mixed type (numerical and non-numerical type mixed).

Finally, let’s make sure we remove any trailing characters and whitespace using 'strip':

In [None]:
str_cols = list(data.columns)
str_cols.remove('measurement_error')

In [None]:
# Trim leading/trailing whitespace in every column listed in str_cols, column by column.
data[str_cols] = data[str_cols].apply(lambda column: column.str.strip())

In [None]:
data.head()

In [None]:
# Number of missing cells in each row (sum of the null mask across columns).
missing_by_row = data.isnull().sum(axis=1)
# Reorder the rows so that the ones with the most missing cells come first.
sorted_rows = data.loc[missing_by_row.sort_values(ascending=False).index]
print("Top 10 rows with the most missing values:")
print(sorted_rows.head(10))

Oops, it seems like in some cases in the column 'DOI' the Journal name sticks to the DOI, let's fix it

### Fixing DOI column

Let's have one doi as an example and try to fix it

In [None]:
print(data.loc[3056, 'DOI'])

Let's also convert the 'DOI' column into a list so it is more convenient to work with

In [None]:
DOI_column = data['DOI'].tolist()

To separate the DOI from all the other unnecessary stuff we have in this column, let's use a regular expression

In [None]:
# Candidate DOI matcher: "10.", a 4-9 digit registrant prefix, "/", then a suffix made of
# upper-case letters, digits and the characters - . _ ; ( ) / :
# NOTE(review): the trailing lookahead (?=_) only succeeds when the suffix is followed by an
# underscore, and no re.IGNORECASE flag is set. The sample DOI in the next cell contains no
# "_", so re.match returns None for it - confirm this is the intended pattern.
DOI_pattern = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+(?=_)')

In [None]:
DOI_example = '10.1016/S0963-9969(01)00105-3Food Research International'
DOI_example

In [None]:
DOI_match_example = re.match(DOI_pattern, DOI_example)

In [None]:
DOI_match_example

Okay, it worked. Now let's try it on the whole column

For that, let's first create a function which checks whether a DOI is valid

In [None]:
def is_valid_doi(doi_str:str) -> bool:
    """
    Check whether a DOI resolves to a Crossref record that has a title.

    Args:
        doi_str (str): The DOI to look up.

    Returns:
        bool: True if the Crossref API answers with status 200 and the
        "message" object of the JSON body contains a "title" key.
        False otherwise, including when the input is not a non-empty string,
        the request fails or times out, or the body is not the expected JSON.

    Example Usage:
        >>> is_valid_doi('10.1016/j.jacc.2020.02.068')
        True
    """
    # Nothing to look up for missing values (NaN/None) or blank strings.
    if not isinstance(doi_str, str) or not doi_str.strip():
        return False

    # Construct the API URL for the DOI
    url = f"https://api.crossref.org/works/{doi_str}"

    # A timeout keeps one stalled connection from hanging a loop over
    # thousands of DOIs; a network error is reported as "not validated"
    # instead of aborting the whole run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return False

    if response.status_code != 200:
        return False

    # The DOI counts as valid only when the returned metadata carries a title.
    try:
        return "title" in response.json()["message"]
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected structure.
        return False

Next let's create a function which will clean the DOI column

In [None]:
def clean_DOI(initial_DOI_list: List[str]) -> List[str]:
    """
    Extract the DOI from each raw entry and keep it only if Crossref knows it.

    Each entry is matched (case-insensitively) against a pattern anchored at
    the start of the string; surrounding underscores are removed from the
    match and the result is checked with ``is_valid_doi``.

    Args:
        initial_DOI_list (List[str]): Raw values of the DOI column.

    Returns:
        List[str]: A list of the same length and order as the input. Each
        element is either the extracted DOI or the string 'invalid' when the
        entry is not a string, does not match the pattern, or fails the
        ``is_valid_doi`` check. Entries that do not match are also printed.
    """
    DOI_pattern = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+', flags=re.IGNORECASE)
    cleaned_DOI_list = []
    for DOI in tqdm(initial_DOI_list):
        # Missing values (NaN) are not strings and cannot be matched.
        DOI_match = DOI_pattern.match(DOI) if isinstance(DOI, str) else None
        if DOI_match is None:
            cleaned_DOI_list.append('invalid')
            print(DOI)
            continue

        DOI_cleaned = DOI_match.group(0).strip('_')
        if is_valid_doi(DOI_cleaned):
            cleaned_DOI_list.append(DOI_cleaned)
        else:
            cleaned_DOI_list.append('invalid')
        # Pause after each API request to stay polite towards the service.
        time.sleep(0.125)
    return cleaned_DOI_list

In [None]:
DOI_new = clean_DOI(DOI_column)

In [None]:
print(len(DOI_column))
print(len(DOI_new))

In [None]:
DOI_new.count('invalid')

After all that, there are still 173 invalid DOIs. Let's try another regular expression on those invalid DOIs

In [None]:
invalid_ids = [i for i, x in enumerate(DOI_new) if x == 'invalid']
print(invalid_ids)

In [None]:
invalid_elements = [DOI_column[i] for i in invalid_ids]
print(invalid_elements, invalid_ids)

First of all let's save our dataframe in case something will go wrong

In [None]:
data.iloc[invalid_ids,:].to_csv("data_2_invalid.tsv", sep='\t')

In [None]:
pwd

New pattern will make sure the DOI ends up with digit

In [None]:
# Second attempt on the rejected entries: the match must end with a digit.
# re.IGNORECASE is required because the character class only lists A-Z, while DOI suffixes
# such as "j.snb.2004.06.015" contain lower-case letters and would otherwise never match.
DOI_no_invalid_elements = [
    re.findall(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+\d', invalid_element, flags=re.IGNORECASE)
    for invalid_element in invalid_elements
]

In [None]:
DOI_no_invalid_elements

In [None]:
status_DOI = [is_valid_doi(DOI_no_invalid_element[0]) for DOI_no_invalid_element in tqdm(DOI_no_invalid_elements) if DOI_no_invalid_element]

In [None]:
status_DOI

In [None]:
DOI_pattern_2 = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+\d', flags=re.IGNORECASE)

In [None]:
#DOI_new_backup = DOI_new.copy()

In [None]:
#with open('DOI_new_backup.txt', 'w') as DOI_file:
    #for DOI in DOI_new_backup:
        #DOI_file.write(DOI+'\n')

In [None]:
# Retry every entry still marked 'invalid' using the digit-terminated pattern.
# Entries that are recovered overwrite DOI_new in place; the rest are collected in
# yet_invalid_DOI as {position: raw DOI string} for manual inspection.
yet_invalid_DOI = {}
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so tqdm knows the list length.
for id_x, DOI in enumerate(tqdm(DOI_new)):
    if DOI != 'invalid':
        continue
    DOI_to_cor = DOI_column[id_x]
    DOI_pot_cor = re.findall(DOI_pattern_2, DOI_to_cor)
    if DOI_pot_cor:
        recovered = is_valid_doi(DOI_pot_cor[0])
        # Same pause between API requests as in clean_DOI.
        time.sleep(0.125)
        if recovered:
            DOI_new[id_x] = DOI_pot_cor[0]
            continue
    yet_invalid_DOI[id_x] = DOI_to_cor

Some DOIs are still invalid; let's take a look at them

In [None]:
yet_invalid_DOI

In [None]:
len(DOI_new)

In [None]:
DOI_new.count('invalid')

In [None]:
#DOI_new = DOI_new_backup.copy()

In [None]:
yet_invalid_DOI = {114: '10.1038/ncomms8',
 136: '10.1016/j.mee.2004.03.068Microelectronic Engineering',
 2057: '10.1016/S0026-2692(03)00137-XMicroelectronics Journal',
 2790: '10.1016/j.snb.2004.06.015',
 3095: '10.1016/S1350-4495(99)00047-XInfrared Physics & Technology',
 3178: '10.3389/fpls.2014.00',
 3555: '10.1002/jbio.201700',
 4420: '10.1016/S0925-4005(99)00427-XSensors and Actuators B: Chemical',
 4500: '10.1063/1.4765',
 4681: '10.1016/S0038-092X(00)00013-XSolar Energy'}

There are only 10 of them, so let's check em and append manually

In [None]:
# Manually resolved entries: list position in DOI_new -> value to store there
# ('NA' marks entries for which no DOI could be recovered).
manual_DOI_fixes = {
    114: 'NA',
    136: '10.1016/j.mee.2004.03.068',
    2057: '10.1016/S0026-2692(03)00137-X',
    2790: '10.1016/j.snb.2004.06.015',
    3095: '10.1016/S1350-4495(99)00047-X',
    3178: 'NA',
    3555: 'NA',
    4420: '10.1016/S0925-4005(99)00427-X',
    4500: 'NA',
    4681: '10.1016/S0038-092X(00)00013-X',
}
for row_position, fixed_DOI in manual_DOI_fixes.items():
    DOI_new[row_position] = fixed_DOI

In [None]:
data['DOI'] = DOI_new

In [None]:
data.to_csv('data_2_DOI_new.tsv', sep='\t')

## Getting missing values

Let's think about how to collect the missing values in the 'Date', 'Journal' and 'Title' columns. We can retrieve information about the papers in JSON format through Crossref and parse it

In [None]:
pip install crossref-commons

In [None]:
import crossref_commons.retrieval

Let's make a query using doi

In [None]:
tmp = crossref_commons.retrieval.get_publication_as_json('10.1016/j.jallcom.2017.03.270')

Here we can see key-words, using them we can identify which values we need to collect

In [None]:
# List the top-level field names of the Crossref record, one per line.
for key in tmp:
    print(key)

In [None]:
tmp

Collecting date

In [None]:
tmp['indexed']['date-parts'][0]

In [None]:
# Use 'issued' for the publication date: per the Crossref API docs 'indexed' is the time the
# record was last indexed by Crossref, not when the article was published.
issued_parts = tmp['issued']['date-parts'][0]
# date-parts is [year, month, day] but month/day may be absent; emit month/day/year using
# only the parts that are present (e.g. "7/15/2017", "7/2017" or "2017").
publish_date = "/".join(str(part) for part in issued_parts[1:] + issued_parts[:1])

In [None]:
publish_date

Collecting Title of the article

In [None]:
article_title = tmp['title'][0]

In [None]:
article_title

Collecting the name of the journal

In [None]:
print(tmp['short-container-title'])
print(tmp['container-title'])
print(tmp['original-title'])

Here we can see that several keys could hold the name of the journal. They are more or less the same, so we'll select the first one

In [None]:
journal_title = tmp['short-container-title'][0]

In [None]:
journal_title

### Getting rid of rows with missing DOI

In [None]:
# Count how many rows have an unrecoverable DOI (marked 'NA').
# A boolean sum is used because value_counts()['NA'] raises KeyError when no row is marked.
(data['DOI'] == 'NA').sum()

In [None]:
# Keep only the rows whose DOI is not the 'NA' marker, then report how many remain.
data = data.loc[data['DOI'] != 'NA'].copy()
len(data)

### Getting rid of duplicates

In [None]:
#let's check if there are any duplicates in the df:
data.duplicated().sum()

In [None]:
duplicates = data[data.duplicated(keep=False)]
print(duplicates)

In [None]:
unique_data = data.drop_duplicates()
len(unique_data)

### Filling in title, journal names and date

In [None]:
#to do: turn this into a function
#check 1: the API returned something (len of tmp is greater than 0)
#check 2: indexed, title, short-container-title are PRESENT -> (try (run it) except ('NA'))
#build a dict: key - DOI, values - list(indexed, title, short-container-title) -> pandas.df -> merge it into our data by DOI

In [None]:
print("Missing values distribution by column: ")
print(unique_data.isnull().mean())

In [None]:
data.head()

Here we can see that most of the missing data is in the columns 'measurement_wavelength' and 'normalised_name'; we'll fill them a bit later. First, let's look at another column - 'Title'. For some reason the article titles were imported not as sentences but as a continuous sequence of capital letters. There are also missing values in the 'Date' and 'Journal' columns, which we should fill in as well. Now that we know all the rows are unique and valid, let's collect this data for the articles with a function.

In [None]:
unique_data.to_csv('data_copy.csv', sep=',', index=False)

In [None]:
def fill_date_journal_title(data):
    """
    Look up Date, Journal and Title on Crossref for a list of rows.

    Args:
        data: iterable of 4-item rows ``(doi, date, journal, title)``.

    Returns:
        dict: maps each DOI to ``[date, journal, title]``. A field keeps the
        value passed in when the DOI fails ``is_valid_doi``, when Crossref
        returns nothing, or when the record lacks that field. Rows sharing a
        DOI collapse into a single entry (the last such row sets the
        fall-back values).

    Notes:
        The date is taken from the record's 'issued' field (publication date)
        and formatted month/day/year. The 'indexed' field is not used because,
        per the Crossref API documentation, it is the time Crossref last
        indexed the record. Incomplete dates (year only, or year and month)
        leave the incoming date untouched.
    """
    result = {}
    # One Crossref lookup per distinct DOI: many rows share the same article.
    crossref_cache = {}
    for row in tqdm(data):
        doi, date, journal, title = row

        if doi not in crossref_cache:
            record = None
            if is_valid_doi(doi):
                record = crossref_commons.retrieval.get_publication_as_json(doi)
            crossref_cache[doi] = record
            time.sleep(0.125)  # pause after every lookup so the API does not ban us
        tmp = crossref_cache[doi]

        if tmp:
            # Each field is best-effort: a missing or oddly shaped field
            # simply leaves the incoming value in place.
            try:
                date_parts = tmp['issued']['date-parts'][0]
                if len(date_parts) == 3 and all(part is not None for part in date_parts):
                    date = f"{date_parts[1]}/{date_parts[2]}/{date_parts[0]}"
            except (KeyError, TypeError, IndexError):
                pass
            try:
                title = tmp['title'][0]
            except (KeyError, TypeError, IndexError):
                pass
            try:
                journal = tmp['short-container-title'][0]
            except (KeyError, TypeError, IndexError):
                pass

        result[doi] = [date, journal, title]
    return result

In [None]:
unique_data.to_csv('unique_data.csv', index=False, sep=',')

Now let's apply the function to the dataset

In [None]:
# fill_date_journal_title returns a dict {DOI: [Date, Journal, Title]}. The following cells
# use data_filled as a DataFrame (data_filled["Name"], .to_csv, ...), so the looked-up
# values are written back into a copy of unique_data, matched by DOI.
data_list = unique_data[['DOI', 'Date', 'Journal', 'Title']].values.tolist()
filled_by_doi = fill_date_journal_title(data_list)
filled_columns = pd.DataFrame.from_dict(
    filled_by_doi, orient='index', columns=['Date', 'Journal', 'Title']
)
data_filled = unique_data.copy()
for column in ['Date', 'Journal', 'Title']:
    data_filled[column] = data_filled['DOI'].map(filled_columns[column])

In [None]:
print("Missing values distribution by column: ")
print(unique_data.isnull().mean())

In [None]:
data_filled

### Filling in SMILES

Now let's fill in missing smiles using PubChem

In [None]:
from urllib.parse import quote

# Set up the PubChem API URL and parameters
url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
params = {"property": "CanonicalSMILES"}

# SMILES found for each distinct compound name (None when the lookup failed),
# so a name that occurs in many rows is requested only once
smiles_by_name = {}

# One entry per row of data_filled, in row order
smiles = []

# Loop through each compound name and get the SMILES from the PubChem API
for name in tqdm(data_filled["Name"]):
    # Missing names (NaN) or blank names cannot be looked up
    if not isinstance(name, str) or not name.strip():
        smiles.append(None)
        continue

    if name not in smiles_by_name:
        smile = None
        try:
            # quote() escapes characters such as "/", "#" or spaces that would otherwise
            # change the meaning of the URL path
            response = requests.get(
                url + quote(name, safe="") + "/property/" + params["property"] + "/JSON",
                timeout=30,
            )
            properties = response.json()["PropertyTable"]["Properties"][0]
            # NOTE(review): newer PubChem responses may return this property under the
            # key "ConnectivitySMILES" - confirm against the current PUG REST docs
            smile = properties.get(params["property"]) or properties.get("ConnectivitySMILES")
        except (requests.RequestException, ValueError, KeyError, IndexError,
                TypeError, AttributeError):
            # Request failed, or the name is unknown / the body has no property table
            smile = None
        smiles_by_name[name] = smile
        # NOTE(review): PubChem asks clients to limit their request rate
        # (5 requests per second at the time of writing - confirm)
        time.sleep(0.2)

    smiles.append(smiles_by_name[name])

# Add the SMILES to the data frame
data_filled["SMILES"] = smiles

# Save the updated data frame to a new file; index=False so that reading the file back
# does not add an extra "Unnamed: 0" column
data_filled.to_csv("data_with_smiles.tsv", sep='\t', index=False)

In [None]:
data_filled = pd.read_csv('data_with_smiles.tsv', sep='\t')

Filling missing smiles in

In [None]:
# Fill gaps in normalised_name with the SMILES fetched from PubChem. The result is assigned
# back explicitly: fillna(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update data_filled (it does not under pandas Copy-on-Write).
data_filled["normalised_name"] = data_filled["normalised_name"].fillna(data_filled["SMILES"])

In [None]:
data_filled.to_csv("data_filled_.tsv", sep='\t')

In [None]:
from rdkit import Chem

# Some entries in the Name column may already be SMILES strings: for rows that still lack
# normalised_name, try to parse Name as SMILES and store RDKit's SMILES output for it.
for index, row in tqdm(data_filled.iterrows(), total=len(data_filled)):
    # only rows where normalised_name is missing
    if not pd.isna(row["normalised_name"]):
        continue
    candidate = row["Name"]
    # MolFromSmiles needs a string; missing names (NaN) are skipped, and so are blank names,
    # which would otherwise produce an empty SMILES
    if not isinstance(candidate, str) or not candidate.strip():
        continue
    mol = Chem.MolFromSmiles(candidate)
    if mol is not None:
        # update the normalised_name column with the SMILES string
        data_filled.at[index, "normalised_name"] = Chem.MolToSmiles(mol)

In [None]:
print("Missing values distribution by column: ")
print(data_filled.isnull().mean())

Can see that some smiles are still missing, let's try another way

In [None]:
data_filled.to_csv("data_filled.tsv", sep='\t')

In [None]:
import os

from chemspipy import ChemSpider

# initialize the ChemSpider API client; the key is read from the CHEMSPIDER_API_KEY
# environment variable when it is set, otherwise the placeholder below is used
cs = ChemSpider(os.environ.get("CHEMSPIDER_API_KEY", "YOUR_API_KEY"))

# iterate over the rows in the DataFrame
for index, row in tqdm(data_filled.iterrows(), total=len(data_filled)):
    # only rows where normalised_name is still missing
    if not pd.isna(row["normalised_name"]):
        continue
    # get the name of the compound; missing or blank names cannot be searched
    name = row["Name"]
    if not isinstance(name, str) or not name.strip():
        continue
    # search for the compound in ChemSpider
    results = cs.search(name)
    # check if any results were found
    if len(results) > 0:
        # get the SMILES string of the first result
        smiles = results[0].smiles
        # only store an actual SMILES string, never None or ""
        if smiles:
            data_filled.at[index, "normalised_name"] = smiles

In [None]:
print("Missing values distribution by column: ")
print(data_filled.isnull().mean())

Saving the result

In [None]:
data_filled.to_csv('data_KONECHNOE.tsv', sep='\t', index=False)

### Fixing raw values

First of all, let's create a regular expression to describe the pattern of this parameter

In [None]:
raw_value_pattern = r'^\d+(\.\d+)?$'

Then, let's check what pattern of the column matches this pattern

In [None]:
import re

# Calculate the percentage of values that match the pattern
matches = data_filled['raw_value'].str.match(raw_value_pattern).sum()
total = len(data_filled['raw_value'])
match_percent = 100 * matches / total
print(f"Percentage of values that match the pattern: {match_percent:.2f}%")

All the values which do not match the pattern we put together

In [None]:
# Values that are not a plain number. na=False makes missing values count as "no match";
# without it the mask contains NaN and the ~ negation fails.
is_plain_number = data_filled['raw_value'].str.match(raw_value_pattern, na=False)
non_matches = data_filled.loc[~is_plain_number, 'raw_value']
print("Values that do not match the pattern:")
print(non_matches)

In order to correct incorrect values ​​that do not satisfy the pattern, we find all possible patterns and group them

In [None]:
import re
import difflib

def find_closest_match(string, string_list):
    """Return the candidate in string_list most similar to string, or None.

    Similarity is judged by difflib.get_close_matches with a cutoff of 0.8;
    only the single best candidate is considered.
    """
    best = difflib.get_close_matches(string, string_list, n=1, cutoff=0.8)
    return best[0] if best else None

def get_pattern(value, pattern_groups):
    """Return the pattern under which value should be grouped.

    Every number in value (digits with an optional decimal part) is replaced
    by 'X'. If an existing key of pattern_groups is close enough to that
    placeholder form (see find_closest_match), the key is returned;
    otherwise the placeholder form itself is returned.
    """
    placeholder_form = re.sub(r'\d+(\.\d+)?', r'X', value)
    closest_key = find_closest_match(placeholder_form, pattern_groups.keys())
    return closest_key if closest_key else placeholder_form

# Group the values by pattern similarity, remembering the pattern chosen for each row.
# The pattern is recorded during this single pass: get_pattern depends on which keys already
# exist in pattern_groups, so calling it again after the loop can pick a different key than
# the group the value was actually put into.
pattern_groups = {}
assigned_patterns = []
for value in data_filled['raw_value']:
    pattern = get_pattern(value, pattern_groups)
    pattern_groups.setdefault(pattern, []).append(value)
    assigned_patterns.append(pattern)

# Create a new column with the patterns (same order as the rows iterated above)
data_filled['patterns'] = assigned_patterns

# Print out the groups
for pattern, values in pattern_groups.items():
    print(f"Pattern {pattern}:")
    print(values)

# Print out the patterns
print(f"Patterns: {list(pattern_groups.keys())}")

In [None]:
data_filled

As a result, I have a list of patterns:
['X', 'X±X', 'X []', 'X–X', 'X to X', 'X−X', 'X ± X', 'X ± X', 'X (X)', '∼X', '~X', 'X ± X', 'X-X', 'X, X, X', 'X + iX', 'X (EtOH)', 'X, X', 'X +', 'X,X ± X,X', 'X at X μm', '<X', 'X–X (this)', 'X X X X', 'X*', 'X+Xi', 'X; X; X', 'Xe']
Each pattern should be handled by the code as follows:
- If the pattern is the 'X', remains the original value
- if the pattern is the 'X±X', remains the first X, the X after '±' deleats
- if the pattern is the 'X []', remains the X, the ' []' delleats
- if the pattern is the 'X–X', the mean of two X before and after '–' should be found
- if the pattern is the 'X to X', the the mean of two X before and after ' to ' should be found
- if the pattern is the 'X−X', the mean of two X before and after '−' should be found
- if the pattern is the 'X ± X', remains the first X, the X after ' ± ' should be deleated
- if the pattern is the 'X ± X', remains the first X, the X after ' ± ' should be deleated
- if the pattern is the 'X (X)', the '(' and ')' should be deleated
- if the pattern is the '∼X', remains the X, the '∼' should be deleated
- if the pattern is the '~X', remains the X, the '~' should be deleated
- if the pattern is the 'X ± X', remains the first X, the X after ' ± ' should be deleated
- if the pattern is the 'X-X', the mean of two X before and after '-' should be found
- if the pattern is the 'X, X, X', the mean of three X should be found
- if the pattern is the 'X + iX', remains the first X, the X after ' + ' should be deleated
- if the pattern is the 'X (EtOH)', remain the X, ' (EtOH)' should be deleated
- if the pattern is the 'X, X', the mean of two X before and after ', ' should be found
- if the pattern is the 'X +', remain the X, ' +' should be deleated
- if the pattern is the 'X,X ± X,X', the ',' should be replaced with '.', and then everything after '±' should be deleated
- if the pattern is the 'X at X μm', remain the first X, ' at X μm' should be deleated
- if the pattern is the '<X', remain the X, '<' should be deleated
- if the pattern is the 'X–X (this)', the the mean of two X before and after '–' should be found, ' (this)' should be deleated
- if the pattern is the 'X X X X', the the mean of four X should be found
- if the pattern is the 'X*', remain the X, '*' should be deleated
- if the pattern is the 'X+Xi', the the mean of two X before and after '+' should be found, 'i' should be deleated
- if the pattern is the 'X; X; X', the the mean of three X should be found
- if the pattern is the 'Xe', remain the X, 'e' should be deleated

Let's make a function which will fix all those wrong patterns

In [None]:
# A number: digits with an optional decimal part. The group is non-capturing so that
# findall returns the whole number rather than only its fractional part.
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')

# Patterns whose result is the mean of every number found in the value.
_MEAN_PATTERNS = frozenset({
    'X–X', 'X to X', 'X−X', 'X-X',
    'X, X, X', 'X, X',
    'X–X (this)', 'X X X X', 'X; X; X',
})

# Patterns where a leading qualifier sign is dropped and the number kept.
_LEADING_SIGN_PATTERNS = frozenset({'∼X', '~X', '<X'})

# Patterns where only the text before the first occurrence of a separator is kept.
_KEEP_BEFORE = {
    'X±X': '±',
    'X ± X': '±',
    'X []': '[',
    'X (X)': '(',
    'X + iX': '+',
    'X+Xi': '+',
    'X (EtOH)': '(',
    'X at X μm': ' at ',
    'X at X µm': ' at ',
    'X*': '*',
    'X +': '+',
    'Xe': 'e',
}


def _mean_of_numbers(text):
    """Return the mean of all numbers in text as a string, or None if there are none."""
    numbers = [float(token) for token in _NUMBER_RE.findall(text)]
    if not numbers:
        return None
    return str(sum(numbers) / len(numbers))


def process_value(value, pattern):
    """Reduce a raw value to a single number according to its pattern.

    Args:
        value: the raw value, e.g. '1.45 ± 0.02'.
        pattern: the pattern assigned to the value, e.g. 'X ± X'.

    Returns:
        The value unchanged when pattern is 'X'. Otherwise a string holding
        one number: the first number for uncertainty, annotation and complex
        notations; the number after the sign for '~X', '∼X' and '<X'; the mean
        of all numbers for ranges and lists; the first number with a decimal
        point for 'X,X ± X,X'. None is returned for an empty value, a
        non-string value or pattern, an unknown pattern, or a range/list
        pattern whose value contains no number.

    Any Unicode whitespace in value and pattern (for example a non-breaking
    space) is treated as a regular space, so visually identical patterns such
    as the variants of 'X ± X' are handled the same way.
    """
    if not value:
        return None
    if pattern == 'X':
        return value
    if not isinstance(value, str) or not isinstance(pattern, str):
        return None

    text = re.sub(r'\s', ' ', value).strip()
    key = re.sub(r'\s', ' ', pattern)

    if key in _MEAN_PATTERNS:
        return _mean_of_numbers(text)
    if key == 'X,X ± X,X':
        # Decimal commas: keep the part before the uncertainty, then switch to a point.
        return text.split('±')[0].replace(',', '.').strip()
    if key in _LEADING_SIGN_PATTERNS:
        return text.lstrip('∼~<').strip()

    separator = _KEEP_BEFORE.get(key)
    if separator is not None:
        return text.split(separator)[0].strip()
    return None

In [None]:
patterns = list(data_filled['patterns'])

In [None]:
values = list(data_filled['raw_value'])
import pandas as pd

# create a DataFrame from the list
df = pd.DataFrame({'values': values})

# save the DataFrame to an Excel file
df.to_excel('values.xlsx', index=False)

In [None]:
# Save the frame that carries the filled-in values. `data` is the frame from before the
# duplicate removal and the Date/Journal/Title/SMILES filling, so writing it here would
# discard that work.
data_filled.to_csv('data_final.tsv', sep='\t', index=False)

### Collecting CIDs

CID is the PubChem Compound Identifier: a unique integer that PubChem assigns to each compound record so the compound can be identified and tracked. We can use it later to collect additional information

In [None]:
data = pd.read_csv('./data_final.tsv', sep='\t')

In [None]:
import pubchempy as pcp

# Define a function to get the CID for a given SMILES string
def get_cid(smiles):
    """Return the PubChem CID of the first compound found for a SMILES string.

    Args:
        smiles: a SMILES string; missing values (NaN/None) and blank strings
            are accepted and yield None without any lookup.

    Returns:
        The ``cid`` of the first result returned by ``pcp.get_compounds``,
        or None when there is no result or the lookup raises an error.
    """
    # Nothing to search for when the SMILES is missing.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    try:
        # Search PubChem database using the SMILES string
        results = pcp.get_compounds(smiles, 'smiles')
        if results:
            # Return the CID number of the first result
            return results[0].cid
    except Exception:
        # Best effort: a failed lookup counts as "no CID". Unlike a bare except, this
        # still lets Ctrl-C (KeyboardInterrupt) stop a long run.
        pass
    return None

# Look up a CID for every entry of the 'normalised_name' column and keep the results in a new 'CID' column
data['CID'] = data['normalised_name'].map(get_cid)

Let's drop all the rows where CID and SMILES are missing, we couldn't find descriptors for them anyways

In [None]:
# Rows with neither a SMILES nor a CID. These columns live in `data`; at this point `df`
# is the single-column frame of raw values created for the Excel export.
filtered_df = data[data['normalised_name'].isna() & data['CID'].isna()]
filtered_df

In [None]:
# Drop the rows of `data` in which both SMILES and CID are missing. `data` is the frame that
# holds these columns; the previous `df` only contained the raw values.
df = data.dropna(subset=['normalised_name', 'CID'], how='all')

In [None]:
df = pd.read_excel('final_dataset.xlsx')

## Getting descriptors

In [None]:
df = pd.read_excel('final_dataset.xlsx')

### RDKit

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from mordred import Calculator, descriptors
from rdkit.Chem import Descriptors

In [None]:
calc = Calculator(descriptors, ignore_3D=True)
len(calc.descriptors)

In [None]:
def compute_descriptors(smiles):
    """Run the module-level descriptor calculator ``calc`` on one SMILES string.

    Args:
        smiles: a SMILES string; non-string input (e.g. NaN) yields None.

    Returns:
        The result of ``calc(mol)`` for the parsed molecule, or None when the
        input is not a string, the SMILES cannot be parsed, or the
        calculation raises an error.
    """
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None for SMILES it cannot parse.
        if mol is None:
            return None
        return calc(mol)
    except Exception:
        # Best effort; unlike a bare except this does not swallow KeyboardInterrupt.
        return None

In [None]:
df = pd.read_excel('final_dataset.xlsx')

In [None]:
# compute every descriptor listed in rdkit's Descriptors.descList for one SMILES string
def calc_descriptors(smiles):
    """Return a dict {descriptor name: value} for the given SMILES.

    When smiles is a float (a missing value) or cannot be converted into an
    RDKit molecule, every descriptor name maps to NaN instead.
    """
    # floats are never handed to RDKit; they are treated like an unparsable SMILES
    mol = None if isinstance(smiles, float) else Chem.MolFromSmiles(smiles)
    if mol is None:
        return {desc_name: np.nan for desc_name, _ in Descriptors.descList}
    return {desc_name: desc_func(mol) for desc_name, desc_func in Descriptors.descList}

# drop any rows with missing values in the 'normalised_name' column
df = df.dropna(subset=['normalised_name'])

# calculate RDKit descriptors for each compound in the 'normalised_name' column and expand
# the per-row dictionaries into columns. The new frame reuses df's index: after dropna the
# index has gaps, and a fresh 0..n-1 index would make concat pair descriptors with the wrong
# rows and append extra all-NaN rows.
rdkit_desc = pd.DataFrame(df['normalised_name'].apply(calc_descriptors).tolist(), index=df.index)

# append the descriptor columns to the DataFrame
df = pd.concat([df, rdkit_desc], axis=1)

In [None]:
df.to_excel("df_after_rdkit.xlsx", index=False)

### Mordred

In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
from mordred import Calculator, descriptors

In [None]:
!pip install session_info
import session_info
session_info.show() 

In [None]:
def All_Mordred_descriptors(data):
    """Compute Mordred descriptors for a collection of SMILES strings.

    Args:
        data: SMILES strings, e.g. a pandas Series; anything accepted by
            ``pd.DataFrame(data)`` whose first column holds the SMILES.

    Returns:
        pd.DataFrame: ``pd.DataFrame(data)`` with one extra column per
        descriptor, row-aligned with the input. Rows whose SMILES is missing
        or cannot be parsed get NaN in every descriptor column.
    """
    # NOTE(review): ignore_3D=False also requests 3D descriptors, while the molecules built
    # below come straight from SMILES - confirm whether ignore_3D=True was intended.
    calc = Calculator(descriptors, ignore_3D=False)

    df = pd.DataFrame(data)

    # Parse the SMILES, remembering the row position of every molecule that
    # could be built so the descriptors can be put back on the right rows.
    valid_positions = []
    mols = []
    for position, smi in enumerate(df.iloc[:, 0]):
        if not isinstance(smi, str):
            # skip missing values
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            valid_positions.append(position)
            mols.append(mol)

    descriptors_df = calc.pandas(mols)

    # calc.pandas numbers its rows 0..len(mols)-1. Map them back to the input
    # rows, leaving NaN where no molecule was available.
    descriptors_df.index = valid_positions
    descriptors_df = descriptors_df.reindex(range(len(df)))
    descriptors_df.index = df.index

    # add the descriptor columns to the original dataframe in one step
    return pd.concat([df, descriptors_df], axis=1)

In [None]:
# create descriptor calculator with all descriptors
calc = Calculator(descriptors, ignore_3D=True)

len(calc.descriptors)

In [None]:
# call the function with the input dataframe to get the dataframe with new columns
mordred_descriptors = All_Mordred_descriptors(df['normalised_name'])

In [None]:
mordred_descriptors.shape

In [None]:
df_concat = pd.concat([df, mordred_descriptors.drop(columns=['normalised_name'])], axis=1)  # Concatenate the dataframes horizontally

In [None]:
df_concat.to_csv('df_after_mordred.tsv', sep='\t', index=False)

### PubChem

In [None]:
df = df.rename(columns={'CID': 'cid'})

In [None]:
df = df.rename(columns={'normalised_name': 'isomeric_smiles'})

In [None]:
# PubChem properties collected for every compound
pubchem_properties = ['xlogp', 'tpsa', 'exact_mass', 'molecular_weight', 'complexity']
# placeholder used when a row has no CID or the lookup fails
missing_properties = dict.fromkeys(pubchem_properties)

# one dict per row of df. The list is deliberately not called `descriptors`: that name is
# the mordred module imported above and rebinding it would break the Calculator cells.
pubchem_rows = []
# properties already fetched, keyed by CID, so repeated compounds are requested only once
properties_by_cid = {}

# iterate over the CID column in the dataframe
for i, cid in tqdm(df['cid'].items(), total=len(df)):
    properties = None

    # rows without a CID keep the placeholder values
    if not pd.isna(cid):
        try:
            # convert float value to integer
            cid = int(cid)

            if cid not in properties_by_cid:
                # search for the compound using the CID
                compound = pcp.Compound.from_cid(cid)
                properties_by_cid[cid] = compound.to_dict(properties=pubchem_properties)

                # add a delay between requests to avoid exceeding rate limit
                time.sleep(0.3)

            properties = properties_by_cid[cid]
        except (ValueError, pcp.PubChemHTTPError):
            # if there's an error, fall back to the placeholder values
            properties = None

    if properties is None:
        properties = missing_properties
    pubchem_rows.append({**properties, 'index': i})

# create a new dataframe with the descriptors
df_descriptors = pd.DataFrame(pubchem_rows)

# set the index of df_descriptors to the 'index' column
df_descriptors.set_index('index', inplace=True)

# merge the two dataframes by index
df = df.merge(df_descriptors, left_index=True, right_index=True)

df.to_csv('df_all_descriptors.tsv', sep='\t', index=False)