In [1]:
import pandas as pd
import spacy
import re
import os

In [2]:
# Load the metadata table, then store the two ID columns as integers
# (missing IDs are filled with 0 first so the integer cast cannot fail on NaN).
df = pd.read_csv('metadata.csv')
id_columns = ['Taxon Object ID', 'IMG Submission ID']
df[id_columns] = df[id_columns].fillna(0).astype(int)

# Handle Locations
def extract_location_from_text(input_text):
    """Return the text of every entity in ``input_text`` that is labelled 'GPE'.

    The text is processed with the notebook-level ``nlp`` object, which must
    already be defined when this function is called.

    Parameters
    ----------
    input_text
        Text handed to ``nlp``.

    Returns
    -------
    list
        ``entity.text`` for each entity in ``doc.ents`` whose ``label_`` is
        'GPE', in the order they appear; an empty list when there are none.
    """
    location_names = []
    for entity in nlp(input_text).ents:
        if entity.label_ == 'GPE':
            location_names.append(entity.text)

    return location_names

# Collect the index labels of all rows with a null 'Geographic Location'.
# (Latitude / Longitude are not inspected here.)
null_locations = list(df[df['Geographic Location'].isnull()].index)

# Use spacy nlp to extract locations from project names and impute, where possible
nlp = spacy.load("en_core_web_sm")
for idx in null_locations:
    # null_locations holds index *labels*, so rows are addressed with .loc;
    # positional .iloc would only agree with them on a default RangeIndex.
    study_name = str(df.loc[idx, 'Study Name (Proposal Name)'])
    extracted_loc = extract_location_from_text(study_name)

    # A missing study name becomes the string 'nan' above; never impute that.
    if extracted_loc and extracted_loc != ['nan']:
        df.loc[idx, 'Geographic Location'] = ' '.join(extracted_loc)

    # Check if there are 'various locations' listed and impute if so
    elif not extracted_loc and re.search('various locations', study_name):
        df.loc[idx, 'Geographic Location'] = 'various locations'

# Read in Portals
portals_df = pd.read_csv('portals.csv', sep = '\t')


In [3]:
# Attach the portal columns to each metadata row: portals are keyed by
# 'taxon_oid' and matched against the metadata's 'Taxon Object ID'.
portals_by_taxon = portals_df.set_index('taxon_oid')
df = df.join(portals_by_taxon, on='Taxon Object ID')
df.head()

Unnamed: 0,Study Name (Proposal Name),Sample Name,Taxon Object ID,IMG Submission ID,\nGOLD IDs in IMG Database,GOLD Analysis Project Type,Submission Type,JGI Analysis Project Type,SRA ID,SRA Run,...,Proportal Isolation,Proportal Ocean,Proportal Station,Proportal WOA Nitrate,Proportal WOA Salinity,Proportal WOA Temperature,GenBank Anomalous Assembly,NCBI Assembly Accession,portal_name,downloaded
0,Marine microbial communities from different lo...,"Marine microbial communities from Red Sea, Sau...",3300038838,219915,\nStudy ID: Gs0132945 Project ID: Gp0274035 ...,Metagenome Analysis,Primary,Metagenome Analysis,SRX1097578,SRR2102997,...,,,,,,,,,IMG_3300038838,no
1,Agricultural soil microbial communities from A...,Agricultural soil microbial communities from s...,3300064906,315709,\nStudy ID: Gs0161729 Project ID: Gp0758376 ...,Metagenome Analysis,Primary,Metagenome Analysis,,,...,,,,,,,,,IMG_3300064906,no
2,Freshwater microbial mat bacterial communities...,Freshwater microbial mat bacterial communities...,3300015360,123175,\nStudy ID: Gs0127369 Project ID: Gp0191361 ...,Metagenome Analysis,Primary,Metagenome Analysis,"SRX3539175, SRX3539174",,...,,,,,,,,,OliLak19BULKMAT1_FD,no
3,Soil microbial communities from Los Alamos Nat...,Soil microbial communities from Los Alamos Nat...,3300053171,270981,\nStudy ID: Gs0153999 Project ID: Gp0565207 ...,Metagenome Analysis,Primary,Metagenome Analysis,,,...,,,,,,,,,S17hydmetagenome_6_FD,no
4,Tropical forest soil microbial communities fro...,Tropical forest soil microbial communities fro...,3300000729,11900,\nStudy ID: Gs0075432 Project ID: Gp0054556 ...,Metagenome Analysis,Reanalysis,Metagenome Analysis,SRX4340826,,...,,,,,,,,,LuqExpMetageno35_FD,no


In [4]:
# Remove rows without Study Names.
# .notna() states the intent directly (instead of negating .isnull() with a
# unary minus), and .copy() makes the result an independent frame so that
# adding columns to it later does not raise SettingWithCopyWarning.
df = df[df['Study Name (Proposal Name)'].notna()].copy()

In [5]:
GOLD_IDS_COLUMN = '\nGOLD IDs in IMG Database'
GOLD_ID_COLUMNS = ['Study ID', 'Project ID', 'Analysis ID']


def parse_gold_ids(mult_ids):
    """Split one GOLD-IDs cell into a ``{id type: id value}`` dict.

    The cell is stripped of surrounding newlines and split on two consecutive
    non-breaking spaces; each piece is then split on ``': '`` into an ID type
    and an ID value.

    Parameters
    ----------
    mult_ids
        Raw cell value. Anything that is not a string (e.g. NaN for a missing
        cell) yields an empty dict.

    Returns
    -------
    dict
        ID type mapped to ID value. Pieces without a ``': '`` separator are
        skipped; if an ID type occurs more than once, the last value wins.
    """
    if not isinstance(mult_ids, str):
        return {}

    parsed = {}
    for id_ in mult_ids.strip('\n').split('\xa0\xa0'):
        parts = id_.split(': ')
        if len(parts) < 2:
            # Empty or malformed piece: nothing to record
            continue
        parsed[parts[0]] = parts[1]
    return parsed


# Parse every row exactly once
gold_records = [parse_gold_ids(mult_ids) for mult_ids in df[GOLD_IDS_COLUMN]]

# The three standard ID columns are always created; any other ID type found
# in the data gets its own column as well, in order of first appearance.
id_columns = list(GOLD_ID_COLUMNS)
for record in gold_records:
    for id_type in record:
        if id_type not in id_columns:
            id_columns.append(id_type)

# Rows that lack a given ID type keep None in that column
for id_column in id_columns:
    df[id_column] = [record.get(id_column) for record in gold_records]
            

In [6]:
# Add contigs.fna files to metadata
### CHANGE THIS PATH
# sorted() makes the order of names in 'files downloaded' reproducible
downloaded = sorted(os.listdir('/Users/kaenurge/Desktop/G2 Lab/crisprdata_test/'))

### COMMENT THIS OUT IF RUNNING ANOTHER BATCH
df['files downloaded'] = None

id_cols = ['Study ID', 'Project ID', 'Analysis ID']

# Files that belong to a portal name shared by several rows but whose ID
# matched none of them; they are attached to the first such row.
# TODO: record this in a dedicated metadata column.
unresolved_files = []

for file in downloaded:
    # File names look like '<portal name>__<ID>_...'
    name_parts = file.split('__')
    project = name_parts[0]

    idx = df.index[df['portal_name'] == project]
    if len(idx) == 0:
        # No metadata row for this file
        continue

    # Handle cases where there are multiple projects with the same name
    if len(idx) > 1:
        file_name = name_parts[1].split('_')[0] if len(name_parts) > 1 else ''

        matched = idx[:0]
        if file_name:
            # Only rows that share the portal name are candidates. The ID is
            # matched as a literal substring, and missing IDs never match.
            candidates = df.loc[idx, id_cols]
            condition_mask = candidates.apply(
                lambda col: col.str.contains(file_name, regex=False, na=False)
            ).any(axis=1)
            matched = candidates.index[condition_mask]

        if len(matched) > 0:
            idx = matched
        else:
            # Keep the first same-named row instead of failing on an empty match
            unresolved_files.append(file)

    row_label = idx[0]

    # Append file name, handle cases if more than 1 file is downloaded for the same project
    current = df.at[row_label, 'files downloaded']
    if pd.isna(current):
        df.at[row_label, 'files downloaded'] = file
    else:
        df.at[row_label, 'files downloaded'] = f'{current}, {file}'

In [16]:
# a = '003ER18SCSDNA_FD__Ga0455849_contigs_ftd.fna'
# b = a.split('__')[1]
# b.split('_')[0]

'Ga0455849'

In [11]:
# id_cols = ['Study ID', 'Project ID', 'Analysis ID']
# condition_mask = df[id_cols].apply(lambda col: col.str.contains('003ER18SCSDNA_FD')).any(axis=1)
# G2 Lab/crisprdata_test/003ER18SCSDNA_FD__Ga0455849_contigs_ftd.fna
# df[condition_mask].index

Index([], dtype='int64')

In [7]:
# Show the first rows of df as this cell's output
df.head()

Unnamed: 0,Study Name (Proposal Name),Sample Name,Taxon Object ID,IMG Submission ID,\nGOLD IDs in IMG Database,GOLD Analysis Project Type,Submission Type,JGI Analysis Project Type,SRA ID,SRA Run,...,Proportal WOA Salinity,Proportal WOA Temperature,GenBank Anomalous Assembly,NCBI Assembly Accession,portal_name,downloaded,Study ID,Project ID,Analysis ID,files downloaded
0,Marine microbial communities from different lo...,"Marine microbial communities from Red Sea, Sau...",3300038838,219915,\nStudy ID: Gs0132945 Project ID: Gp0274035 ...,Metagenome Analysis,Primary,Metagenome Analysis,SRX1097578,SRR2102997,...,,,,,IMG_3300038838,no,Gs0132945,Gp0274035,Ga0237899,
1,Agricultural soil microbial communities from A...,Agricultural soil microbial communities from s...,3300064906,315709,\nStudy ID: Gs0161729 Project ID: Gp0758376 ...,Metagenome Analysis,Primary,Metagenome Analysis,,,...,,,,,IMG_3300064906,no,Gs0161729,Gp0758376,Ga0613092,
2,Freshwater microbial mat bacterial communities...,Freshwater microbial mat bacterial communities...,3300015360,123175,\nStudy ID: Gs0127369 Project ID: Gp0191361 ...,Metagenome Analysis,Primary,Metagenome Analysis,"SRX3539175, SRX3539174",,...,,,,,OliLak19BULKMAT1_FD,no,Gs0127369,Gp0191361,Ga0163144,
3,Soil microbial communities from Los Alamos Nat...,Soil microbial communities from Los Alamos Nat...,3300053171,270981,\nStudy ID: Gs0153999 Project ID: Gp0565207 ...,Metagenome Analysis,Primary,Metagenome Analysis,,,...,,,,,S17hydmetagenome_6_FD,no,Gs0153999,Gp0565207,Ga0494674,
4,Tropical forest soil microbial communities fro...,Tropical forest soil microbial communities fro...,3300000729,11900,\nStudy ID: Gs0075432 Project ID: Gp0054556 ...,Metagenome Analysis,Reanalysis,Metagenome Analysis,SRX4340826,,...,,,,,LuqExpMetageno35_FD,no,Gs0075432,Gp0054556,Ga0001958,


In [8]:
# Write df to 'final_metadata.csv'; index=False keeps the row labels out of the file
df.to_csv('final_metadata.csv', index = False)

In [101]:
# Set up the collection of cctyper results
### CHANGE THIS PATH
directory_path = '/Users/kaenurge/Desktop/G2 Lab/crisprdata copy 2/'
cctyped = os.listdir(directory_path)

# Portal table, with 'taxon_oid' renamed to 'Taxon Object ID'
portals = pd.read_csv('portals.csv', sep='\t').rename(columns={'taxon_oid': 'Taxon Object ID'})

# Start from an empty frame that already carries the three leading columns
df = pd.DataFrame()
for column_name in ('Portal Name', 'ID', 'CRISPR'):
    df[column_name] = None

df.head()

Unnamed: 0,Portal Name,ID,CRISPR


In [102]:
# Gather one table per cctyper output directory, then combine them in a
# single concat instead of re-copying the growing frame on every iteration.
crispr_tables = []

# sorted() makes the row order of the combined table reproducible
for directory in sorted(cctyped):
    file_path = os.path.join(directory_path, str(directory))

    # Loop through directories created by cctyper
    if not os.path.isdir(file_path):
        continue

    # Directory names are expected to look like '<portal name>__<ID>';
    # any other directory is skipped rather than raising IndexError.
    name_parts = directory.split('__')
    if len(name_parts) < 2:
        continue
    project, id_ = name_parts[0], name_parts[1]

    # Check if CRISPRs were found
    crisprs_path = os.path.join(file_path, 'crisprs_all.tab')
    if os.path.isfile(crisprs_path):
        temp_df = pd.read_csv(crisprs_path, sep = '\t')
        temp_df['Portal Name'] = project
        temp_df['ID'] = id_
        crispr_tables.append(temp_df)

# df goes first so its existing columns stay at the front of the result.
# Row labels are not reset, so they restart at 0 for each source table.
df = pd.concat([df, *crispr_tables])

In [103]:
# Show df as this cell's output
df

Unnamed: 0,Portal Name,ID,CRISPR,Contig,Start,End,Consensus_repeat,N_repeats,Repeat_len,Spacer_len_avg,Repeat_identity,Spacer_identity,Spacer_len_sem,Trusted,Prediction,Subtype,Subtype_probability
0,182ER18SCSDNA_FD,Ga0454634,Ga0454634_0031049_1,Ga0454634_0031049,815.0,979.0,GTCGCACCCCACGCGGGTGCGTGAATTGAAAC,3.0,32.0,34.5,100.0,50.0,0.5,True,I-C,I-C,1.0
1,182ER18SCSDNA_FD,Ga0454634,Ga0454634_0053810_2,Ga0454634_0053810,105.0,804.0,GGTTCCCCCGCCTGCGCGGGGATGGCCCC,12.0,29.0,32.0,89.5,45.8,0.0,True,I-E,I-E,1.0
2,182ER18SCSDNA_FD,Ga0454634,Ga0454634_0080066_3,Ga0454634_0080066,1.0,242.0,GCGGCGAGGACCGCGCTCGCCGACG,4.0,25.0,47.3,79.3,52.7,1.5,True,Unknown,I-C,0.437
3,182ER18SCSDNA_FD,Ga0454634,Ga0454634_0126594_4,Ga0454634_0126594,264.0,518.0,ATCTCCGTCGACGTTCGTCGGCGGCCCCATTGAAGC,4.0,36.0,37.0,100.0,47.2,0.6,True,I-G,I-G,0.999
