# Bring in Modules

In [17]:
import pandas as pd
import random
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import numpy as np
pd.options.mode.chained_assignment = None 

# Bring in Data

Note: The first `cord_19_metadata` is the full data, while the second one is the deduplicated one

In [2]:
#cord_19_metadata = pd.read_csv("/Volumes/LaCie/Dispersed Volunteer Research Network/Project 21/Data/2021-03-29/metadata.csv")
cord_19_metadata = pd.read_csv("/Volumes/LaCie/Dispersed Volunteer Research Network/Project 21/Data/210417_deduplicated_data.csv")
fda_biologics = pd.read_csv("/Volumes/LaCie/Dispersed Volunteer Research Network/Project 21/Data/FDA Products List (Last Updated ~September 2020) - Sheet1.csv")
#drug_date_df = pd.read_csv("/Users/cpollack/project_21_clinical_trials_during_covid_19_pandemic/cord_19/210415_drug_date_df.csv")

  interactivity=interactivity, compiler=compiler, result=result)


# Functions

In [3]:
def find_drug(abstract, drug, date, cord_uid):
    '''
        Function that will identify whether the drug of interest is in the abstract of interest and, if so, return a formatted dictionary that can be appeneded to a dataframe
        Parameters:
            abstract (str): The abstract of interest (assumed to be all lowercase)
            drug (str): The drug of interest (assumed to be all lowercase)
            date (str): The date that the manuscript was published (will only be used if drug in abstract)"
            cord_uid (str): The unique identifier that we can use to link the abstract back to the CORD-19 metadata file (will only be used if drug in abstract)"
       
       Return:
            dict_return (dict): A dictionary with the drug, date of publication, and cord_uid for the artcile
     '''
    if drug == "chloroquine": #Adding this extra step or else it will be duplicated with hydroxychloroquine
        if f" {drug}" in abstract:
            dict_return = {"drug": drug,
                           "date": date,
                           "cord_uid": cord_uid}
            return dict_return
    
    elif drug in abstract:
        dict_return = {"drug": drug,
                       "date": date,
                       "cord_uid": cord_uid}
        return dict_return

# Creating Processed List of Biologics

In [21]:
fda_biologics["processed"] = fda_biologics.PRODUCT.str.replace("_", " ")
fda_biologics.processed = fda_biologics.processed.str.lower()
fda_biologics_list = [y for x in fda_biologics.processed.tolist() for y in x.split('/')]

# COVID-19 Related Term

Terms  pulled from NIH MeSH: https://www.ncbi.nlm.nih.gov/mesh/2052179

In [5]:
covid_19_terms = ["COVID 19", "COVID-19", "2019-nCoV", "2019 nCoV", "Coronavirus Disease-19", "Coronavirus Disease 19", 
                  " 2019 Novel Coronavirus", "COVID19", "Coronavirus Disease 2019", "SARS Coronavirus 2 Infection", 
                  "SARS-CoV-2", "SARS CoV 2"]

# Deduplication

## Deduplication Based on CORD_UID

In [6]:
cord_19_metadata_duplicated = cord_19_metadata[cord_19_metadata.duplicated(["cord_uid"], keep = False)]
cord_19_metadata_duplicated #46526 duplicated values

# Drop papers missing an abstract, publish time, or journal and run again
cord_19_metadata = cord_19_metadata.dropna(subset = ["abstract", "publish_time", "journal"]) #334,601
cord_19_metadata_duplicated = cord_19_metadata[cord_19_metadata.duplicated(["cord_uid"], keep = False)]
print(len(cord_19_metadata_duplicated))
#23,547 duplicates
cord_19_metadata = cord_19_metadata.drop_duplicates(subset = "cord_uid", keep = "last")
print(len(cord_19_metadata)) #322,243

23547
322243


## Further deduplication based on PMCID

Note: No duplicates by `sha`, `pmcid`, `doi`, `who_covidence_id`, `arxiv_id` 

In [7]:
cord_19_metadata_duplicated = cord_19_metadata.dropna(subset = ["pubmed_id"])[cord_19_metadata.dropna(subset = ["pubmed_id"]).duplicated(["pubmed_id"], keep = "last")]
len(cord_19_metadata_duplicated) #108 duplicated values

cord_19_metadata = cord_19_metadata[(~cord_19_metadata.duplicated(["pubmed_id"], keep = "last")) | (cord_19_metadata['pubmed_id'].isnull())]
len(cord_19_metadata) #322,135

322135

## Further deduplication based on s2

### Two duplicate s2 values

In [9]:
cord_19_metadata_duplicated = cord_19_metadata.dropna(subset = ["s2_id"])[cord_19_metadata.dropna(subset = ["s2_id"]).duplicated(["s2_id"], keep = False)]
print(len(cord_19_metadata_duplicated)) #154,322
s2_id_list = set(cord_19_metadata_duplicated.s2_id)
print(len(s2_id_list)) #75,746 unique s2 id's 
cord_19_metadata_duplicated = cord_19_metadata_duplicated.sort_values(["s2_id", "publish_time"])
s2_id_multi = []
cord_19_metadata_duplicated["flag"] = -1

#for i in range(0, len(cord_19_metadata_duplicated), 2):
for s2_id in s2_id_list:
    temp_data = cord_19_metadata_duplicated[cord_19_metadata_duplicated.s2_id == s2_id]
    if len(temp_data) == 2:
        i1, i2 = temp_data.index #Extract the indices
        if len(temp_data.publish_time.loc[i1]) == len(temp_data.publish_time.loc[i2]): #If the lengths are equal
            try:
                int((temp_data.publish_time.loc[i1]))
            except ValueError:
                print(f"Cannot convert to int at {i1} - converting to date instead")
                old_date = pd.to_datetime(temp_data.publish_time.loc[i1])
                new_date = pd.to_datetime(temp_data.publish_time.loc[i2])
                if old_date <= new_date:
                    cord_19_metadata_duplicated.flag.loc[i1] = 1
                else:
                    cord_19_metadata_duplicated.flag.loc[i2] = 1

            else:
                if int(temp_data.publish_time.loc[i1]) <= int(temp_data.publish_time.loc[i2]):
                # Pick the lowest date as the primary date to keep 
                    cord_19_metadata_duplicated.flag.loc[i1] = 1
                else:
                    cord_19_metadata_duplicated.flag.loc[i2] = 1

        elif len(temp_data.publish_time.loc[i1]) > len(temp_data.publish_time.loc[i2]): #If the first one has a longer date (pick that one)
            cord_19_metadata_duplicated.flag.loc[i1] = 1
        else:
            cord_19_metadata_duplicated.flag.loc[i2] = 1 
            
    else:
        s2_id_multi.append(s2_id)
        #print(cord_19_metadata_duplicated.s2_id.iloc[i], cord_19_metadata_duplicated.s2_id.iloc[i+1])
        #print(cord_19_metadata_duplicated.publish_time.iloc[i], cord_19_metadata_duplicated.publish_time.iloc[i+1])
        #print(len(cord_19_metadata_duplicated.publish_time.iloc[i]), len(cord_19_metadata_duplicated.publish_time.iloc[i+1]))
    #cord_19_metadata_duplicated.sort_values(["s2_id", "publish_time"]).head(20)

154332
75746
Cannot convert to int at 423077 - converting to date instead
Cannot convert to int at 137946 - converting to date instead
Cannot convert to int at 422810 - converting to date instead
Cannot convert to int at 139663 - converting to date instead
Cannot convert to int at 473795 - converting to date instead
Cannot convert to int at 443946 - converting to date instead
Cannot convert to int at 438195 - converting to date instead
Cannot convert to int at 476298 - converting to date instead
Cannot convert to int at 206726 - converting to date instead
Cannot convert to int at 398617 - converting to date instead
Cannot convert to int at 133671 - converting to date instead


### Triplicate s2 values

In [10]:
s2_id_four = []
for s2_id in s2_id_multi:
    temp_data = cord_19_metadata_duplicated[cord_19_metadata_duplicated.s2_id == s2_id]
    if len(temp_data) == 3:
        #print(s2_id)
        keep_longest = temp_data[temp_data.publish_time.apply(lambda x: len(str(x))==10)] 
        if len(keep_longest) == 1: #If there is only one full date
            cord_19_metadata_duplicated.flag.loc[keep_longest.index] = 1
        elif len(keep_longest) == 2: #If there are two full dates
            i1, i2 = keep_longest.index
            old_date = pd.to_datetime(keep_longest.publish_time.loc[i1])
            new_date = pd.to_datetime(keep_longest.publish_time.loc[i2])
            if old_date <= new_date:
                cord_19_metadata_duplicated.flag.loc[i1] = 1
            else:
                cord_19_metadata_duplicated.flag.loc[i2] = 1
        elif len(keep_longest) == 3: #If there are three full dates
            i1, i2, i3 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        else: # If there is no long date, then they must all be years
            i1, i2, i3 = temp_data.index
            min_date = min(int(temp_data.publish_time.loc[i1]), int(temp_data.publish_time.loc[i2]), int(temp_data.publish_time.loc[i3]))
            temp_data_index = temp_data[temp_data.publish_time == str(min_date)].index[0]
            cord_19_metadata_duplicated.flag.loc[temp_data_index] = 1
    else:
        s2_id_four.append(s2_id)

### Quadruple s2 values

In [11]:
s2_id_five = []
for s2_id in s2_id_four:
    temp_data = cord_19_metadata_duplicated[cord_19_metadata_duplicated.s2_id == s2_id]
    if len(temp_data) == 4:
        #print(s2_id)
        keep_longest = temp_data[temp_data.publish_time.apply(lambda x: len(str(x))==10)] 
        if len(keep_longest) == 1: #If there is only one full date
            cord_19_metadata_duplicated.flag.loc[keep_longest.index] = 1
        elif len(keep_longest) == 2: #If there are two full dates
            i1, i2 = keep_longest.index
            old_date = pd.to_datetime(keep_longest.publish_time.loc[i1])
            new_date = pd.to_datetime(keep_longest.publish_time.loc[i2])
            if old_date <= new_date:
                cord_19_metadata_duplicated.flag.loc[i1] = 1
            else:
                cord_19_metadata_duplicated.flag.loc[i2] = 1
        elif len(keep_longest) == 3: #If there are three full dates
            i1, i2, i3 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        elif len(keep_longest) == 4: #If there are three full dates
            i1, i2, i3, i4 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]), pd.to_datetime(keep_longest.publish_time.loc[i4]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        else: # If there is no long date, then they must all be years
            i1, i2, i3, i4 = temp_data.index
            min_date = min(int(temp_data.publish_time.loc[i1]), int(temp_data.publish_time.loc[i2]), int(temp_data.publish_time.loc[i3]), int(temp_data.publish_time.loc[i4]))
            temp_data_index = temp_data[temp_data.publish_time == str(min_date)].index[0]
            cord_19_metadata_duplicated.flag.loc[temp_data_index] = 1
    else:
        s2_id_five.append(s2_id)

### Quintuple s2 values

In [12]:
s2_id_six = []
for s2_id in s2_id_five:
    temp_data = cord_19_metadata_duplicated[cord_19_metadata_duplicated.s2_id == s2_id]
    if len(temp_data) == 5:
        #print(s2_id)
        keep_longest = temp_data[temp_data.publish_time.apply(lambda x: len(str(x))==10)] 
        if len(keep_longest) == 1: #If there is only one full date
            cord_19_metadata_duplicated.flag.loc[keep_longest.index] = 1
        elif len(keep_longest) == 2: #If there are two full dates
            i1, i2 = keep_longest.index
            old_date = pd.to_datetime(keep_longest.publish_time.loc[i1])
            new_date = pd.to_datetime(keep_longest.publish_time.loc[i2])
            if old_date <= new_date:
                cord_19_metadata_duplicated.flag.loc[i1] = 1
            else:
                cord_19_metadata_duplicated.flag.loc[i2] = 1
        elif len(keep_longest) == 3: #If there are three full dates
            i1, i2, i3 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        elif len(keep_longest) == 4: #If there are three full dates
            i1, i2, i3, i4 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]), pd.to_datetime(keep_longest.publish_time.loc[i4]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        elif len(keep_longest) == 5: #If there are three full dates
            i1, i2, i3, i4, i5 = keep_longest.index
            min_date = min(pd.to_datetime(keep_longest.publish_time.loc[i1]), pd.to_datetime(keep_longest.publish_time.loc[i2]), pd.to_datetime(keep_longest.publish_time.loc[i3]), pd.to_datetime(keep_longest.publish_time.loc[i4]), pd.to_datetime(keep_longest.publish_time.loc[i5]))
            keep_oldest_index = keep_longest[keep_longest.publish_time == min_date].index[0]
            cord_19_metadata_duplicated.flag.loc[keep_oldest_index] = 1
        else: # If there is no long date, then they must all be years
            i1, i2, i3, i4, i5 = temp_data.index
            min_date = min(int(temp_data.publish_time.loc[i1]), int(temp_data.publish_time.loc[i2]), int(temp_data.publish_time.loc[i3]), int(temp_data.publish_time.loc[i4]), int(temp_data.publish_time.loc[i5]))
            temp_data_index = temp_data[temp_data.publish_time == str(min_date)].index[0]
            cord_19_metadata_duplicated.flag.loc[temp_data_index] = 1
    else:
        s2_id_six.append(s2_id)

### Final s2 deduplication step

In [13]:
cord_19_metadata_duplicated_s2 = cord_19_metadata_duplicated[cord_19_metadata_duplicated.flag == -1]
print(len(cord_19_metadata_duplicated_s2)) #78,586 to drop
cord_19_metadata_duplicated_s2 = cord_19_metadata_duplicated_s2.drop("flag", axis = 1)
#cord_19_metadata_duplicated_s2.head()
cord_19_metadata = cord_19_metadata[~cord_19_metadata.isin(cord_19_metadata_duplicated_s2)]
len(cord_19_metadata) #322,135

78586


322135

# Processing Abstract Data

In [14]:
cord_19_metadata["processed_abstract"] = cord_19_metadata.abstract.str.lower()
print(cord_19_metadata.shape) #322,135 rows before
cord_19_metadata = cord_19_metadata[cord_19_metadata.processed_abstract.notna()]
print(cord_19_metadata.shape) #243,549 rows after removing rows that had duplicated s2 values

(322135, 20)
(243549, 20)


# Exporting Deduplicated CORD-19 Data (and 250 random abstracts)

## Deduplicated Data

In [16]:
cord_19_metadata.to_csv("/Volumes/LaCie/Dispersed Volunteer Research Network/Project 21/Data/210417_deduplicated_data.csv")

## Random Rows

In [20]:
random.seed(110295)
cord_19_metadata_random = cord_19_metadata.sample(n = 250, random_state=110295)
print(cord_19_metadata_random.shape)
#print(cord_19_metadata_random.head())
cord_19_metadata_random = cord_19_metadata_random["abstract"]
print(cord_19_metadata_random.head())
cord_19_metadata_random.to_csv("/Volumes/LaCie/Dispersed Volunteer Research Network/Project 21/Data/210417_random_cord19_250.csv")

(250, 20)
75256     Non-contact infrared thermometry of facial ski...
419602    The macrodomain of nsP3 (nsP3MD) is highly con...
461886    Recent revolution of cryo-electron microscopy ...
93304     BACKGROUND Gender imbalances in academia have ...
394485    OBJECTIVE: During COVID-19 pandemic, the insti...
Name: abstract, dtype: object


# Creating COVID-Specific Data

In [251]:
cord_19_covid = cord_19_metadata[cord_19_metadata["processed_abstract"].str.contains('|'.join([x.lower() for x in covid_19_terms]))]
print(cord_19_covid.shape) #103,836 when only keeping those that directly mention COVID-19"

(103836, 20)


# Filtering to Only Include Biologics of Interest

In [254]:
cord_19_covid_fda = cord_19_covid[cord_19_covid["processed_abstract"].str.contains('|'.join(fda_biologics_list))]
print(cord_19_covid_fda.shape) #6,552 when only keeping those that directly mention COVID-19 and one of the biologics
cord_19_covid_fda = cord_19_covid_fda.reset_index(drop = True)

(6552, 20)


# Creating new drug-specific dataframe

In [278]:
drug_date_df = pd.DataFrame()

for row in range(0, len(cord_19_covid_fda)):
    if row % 1000 == 0:
        print(row)
    for drug in fda_biologics_list:
        row_to_append = find_drug(cord_19_covid_fda.processed_abstract[row], 
                                  drug, 
                                  cord_19_covid_fda.publish_time[row], 
                                  cord_19_covid_fda.cord_uid[row])
        
        drug_date_df = drug_date_df.append(row_to_append, ignore_index = True)

0
1000
2000
3000
4000
5000
6000


In [279]:
drug_date_df.to_csv("210415_drug_date_df.csv")

# Processing drug_date_df (can start here since output is saved)

## Converting date column into date

In [263]:
#12,094 rows, suggests some abstracts mention multiple drugs
print(len(drug_date_df))
drug_date_df["date_date"] = pd.to_datetime(drug_date_df["date"], format = "%Y-%m-%d")

12094


## Creating new dataframe that doesn't include hydroxychloroquine (may be duplicated)

In [None]:
drug_date_df_no_hydroxy = drug_date_df.loc[drug_date_df["drug"] != "hydroxychloroquine"] 
drug_date_df_no_hydroxy.shape
#23,848 rows to #19,824 rows

## Remove rows that don't have a date or the date is only the year

In [None]:
drug_date_df_no_hydroxy = drug_date_df_no_hydroxy[(drug_date_df_no_hydroxy.date != "2019") & 
                                                  (drug_date_df_no_hydroxy.date != "2020") & 
                                                  (drug_date_df_no_hydroxy.date != "2021")&
                                                  (drug_date_df_no_hydroxy.date.notnull())]
drug_date_df_no_hydroxy #9,304 rows

# Visualization Over time

## Preprocessing for generic date vs. drug

In [None]:
drug_date_df_no_hydroxy = drug_date_df_no_hydroxy.sort_values(by = "date_date")
date_count = drug_date_df_no_hydroxy.groupby(["date_date"]).agg(func = "count")
date_count = date_count.reset_index()
date_count = date_count[date_count["date_date"] <= "2021-03-29"]
date_count

## Actual Visualization

In [None]:
fig, ax = plt.subplots(figsize=(16, 6))
sns.set_style("ticks")
plot = sns.lineplot(data = date_count,
             x = "date_date",
             y = "drug")
sns.despine(offset = 10)
ax.set(xlabel="Date", 
       ylabel="Count",
       title="Variations in Publications Mentioning Biologics in CORD-19 Over Time")
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
ax.figure.savefig("210403_drug_time.tiff")

# Drug-Specific Visualization Over Time

## Preprocessing

In [None]:
drug_date_count = drug_date_df_no_hydroxy.groupby(["date_date", "drug"]).agg(func = "count")
drug_date_count = drug_date_count.reset_index()
drug_date_count = drug_date_count[drug_date_count["date_date"] <= "2021-03-29"]
drug_date_count

## Plotting

In [None]:
sns.set_style("ticks")
plot = sns.FacetGrid(drug_date_count,
                    col = "drug",
                    hue = "drug",
                    col_wrap = 5)
plot.map_dataframe(sns.lineplot, 
         x = "date_date", 
         y = "date")
sns.despine(offset = 10)

axes = plot.axes.flatten()
for ax in axes:
    ax.set_xlabel("Date")
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=12))
    ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
    ax.tick_params(axis = "x",
                   labelsize = 10,
                   rotation = 45)
    ax.set_ylabel("Count")
plot.tight_layout()
plot.set_titles(col_template = "{col_name}")
#ax.set(xlabel="Date", 
#       ylabel="Count",
#       title="Variations in Publications Mentioning Biologics in CORD-19 Over Time")
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
#ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
plot.savefig("210403_drug_facet.tiff")

# Exporting counts of each drug over time

In [None]:
drug_date_count = drug_date_count.drop(["Unnamed: 0", "cord_uid"], axis = 1)
drug_date_count.columns = ["date", "drug", "count"]

In [None]:
drug_date_count_no_hydroxy = drug_date_count.merge(drug_date_df_no_hydroxy,
                                                   left_on = ["date", "drug"],
                                                   right_on = ["date_date", "drug"])
drug_date_count_no_hydroxy = drug_date_count_no_hydroxy.drop(["Unnamed: 0", "date_y", "date_date"], axis = 1)
drug_date_count_no_hydroxy.columns = ["date", "drug", "count", "cord_uid"]

In [None]:
drug_date_count_no_hydroxy.to_csv("210403_drug_date_count_no_hydroxy.csv")

# Preprints vs. Prints

## Generate Preprint List

In [None]:
journal_list = cord_19_covid_fda.journal.unique().tolist()
journal_list = [str(x).lower() for x in journal_list]
preprint_list = [j for j in journal_list if "rxiv" in j]

## Only keep those that have a journal

In [None]:
cord_19_covid_fda_journal = cord_19_covid_fda[cord_19_covid_fda.journal.notna()]
cord_19_covid_fda_journal.shape #11,667

## Adding Flag

In [None]:
cord_19_covid_fda_journal["journal_processed"] = cord_19_covid_fda_journal.journal.str.lower()
cord_19_covid_fda_journal["preprint_indicator"] = cord_19_covid_fda.journal_processed.isin(preprint_list).astype(int)

## Preprocessing for Visualization

In [None]:
cord_19_covid_fda_journal_flag = cord_19_covid_fda_journal[["cord_uid", "preprint_indicator"]]

## Redoing Visualizations

### Preprocessing for Generic Over Time

In [None]:
drug_date_df_no_hydroxy_preprint = drug_date_df_no_hydroxy.merge(cord_19_covid_fda_journal_flag,
                                                                how = "left")
date_count_preprint = drug_date_df_no_hydroxy_preprint.groupby(["date_date", "preprint_indicator"]).agg(func = "count")
date_count_preprint = date_count_preprint.reset_index()
date_count_preprint = date_count_preprint[date_count_preprint["date_date"] <= "2021-03-29"]
date_count_preprint

### Visualization

In [None]:
fig, ax = plt.subplots(figsize=(16, 6))
sns.set_style("ticks")
plot = sns.lineplot(data = date_count_preprint,
                    x = "date_date",
                    y = "drug",
                   hue = "preprint_indicator")
sns.despine(offset = 10)
ax.set(xlabel="Date", 
       ylabel="Count",
       title="Variations in Publications Mentioning Biologics in CORD-19 Over Time")
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
ax.legend(labels=["Not a Preprint", "Preprint"])
#ax.figure.savefig("210403_drug_time_preprint.tiff")

### Preprocessing for Drug-Specific Over Time

In [None]:
drug_date_count_preprint = drug_date_df_no_hydroxy_preprint.groupby(["date_date", "preprint_indicator", "drug"]).agg(func = "count")
drug_date_count_preprint = drug_date_count_preprint.reset_index()
drug_date_count_preprint = drug_date_count_preprint[drug_date_count_preprint["date_date"] <= "2021-03-29"]
drug_date_count_preprint

### Visualization

In [None]:
sns.set_style("ticks")
plot = sns.FacetGrid(drug_date_count_preprint,
                    col = "drug",
                    hue = "preprint_indicator",
                    col_wrap = 5)
plot.map_dataframe(sns.lineplot, 
         x = "date_date", 
         y = "date").add_legend(title = "Type of Manuscript",
                                labels=["Not a Preprint", "Preprint"])
sns.despine(offset = 10)
axes = plot.axes.flatten()
for ax in axes:
    ax.set_xlabel("Date")
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=12))
    ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
    ax.tick_params(axis = "x",
                   labelsize = 10,
                   rotation = 45)
    ax.set_ylabel("Count")
plot.tight_layout()
plot.set_titles(col_template = "{col_name}")


#ax.set(xlabel="Date", 
#       ylabel="Count",
#       title="Variations in Publications Mentioning Biologics in CORD-19 Over Time")
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
#ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
plot.savefig("210403_drug_facet_preprint.tiff")

## Repeat but for general COVID-19 Data 

### Generate Preprint List

In [None]:
journal_list_covid = cord_19_covid.journal.unique().tolist()
journal_list_covid = [str(x).lower() for x in journal_list_covid]
preprint_list_covid = [j for j in journal_list_covid if "rxiv" in j]

### Only keep those with a journal

In [None]:
cord_19_covid_journal = cord_19_covid[cord_19_covid.journal.notna()]
cord_19_covid_journal.shape #177,233

### Adding Flag

In [None]:
cord_19_covid_journal["journal_processed"] = cord_19_covid_journal.journal.str.lower()
cord_19_covid_journal["preprint_indicator"] = cord_19_covid_journal.journal_processed.isin(preprint_list_covid).astype(int)

### Preprocessing for Visualization

In [None]:
cord_19_covid_journal_flag = cord_19_covid_journal[["cord_uid", "publish_time", "preprint_indicator"]]

### Redoing Visualization

In [None]:
date_count_preprint_covid = cord_19_covid_journal_flag.groupby(["publish_time", "preprint_indicator"]).agg(func = "count")
date_count_preprint_covid = date_count_preprint_covid.reset_index()
date_count_preprint_covid = date_count_preprint_covid[date_count_preprint_covid["publish_time"] <= "2021-03-29"]
date_count_preprint_covid = date_count_preprint_covid[(date_count_preprint_covid.publish_time != "2019") & (date_count_preprint_covid.publish_time != "2020") & (date_count_preprint_covid.publish_time != "2021")]
date_count_preprint_covid["date_date"] = pd.to_datetime(date_count_preprint_covid["publish_time"], format = "%Y-%m-%d")

fig, ax = plt.subplots(figsize=(16, 6))
sns.set_style("ticks")
plot = sns.lineplot(data = date_count_preprint_covid,
                    x = "date_date",
                    y = "cord_uid",
                   hue = "preprint_indicator")
sns.despine(offset = 10)
ax.set(xlabel="Date", 
       ylabel="Count",
       title="Variations in Publications Mentioning COVID-19 in CORD-19 Over Time")
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
ax.legend(labels=["Not a Preprint", "Preprint"])
ax.figure.savefig("210415_covid_time_preprint.tiff")