Setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [56]:
from datetime import date
import numpy as np
import os
import pandas as pd
import sys
# Absolute path two levels above the current working directory; the
# Biblio_Reader package is imported from there.
# NOTE(review): this depends on the directory the notebook is launched from.
ancestor = os.path.abspath(os.path.join(os.pardir, os.pardir))
# Guarded so that re-running the cell does not add duplicate sys.path entries.
if ancestor not in sys.path:
    sys.path.append(ancestor)
# Must stay after the sys.path adjustment above.
from Biblio_Reader import manager as mg
# Directory that dfile() joins its filename argument onto.
ddir = os.path.join(ancestor, "Biblio_Reader", "biblio_reader", "data")


def dfile(d):
    """
    Load one two-column csv file from ddir and derive per-row counts.

    The file is read without a header row and its two columns are named
    "CiteScore" and "cumulative_count". Rows are ordered by ascending
    "cumulative_count" and re-indexed from 0. Two columns are then added:
    " - diff = " (the preceding row's cumulative count, 0 for the first
    row) and "count" (cumulative count minus that preceding value).

    Parameter
    ---------
    d : string
        filename, joined onto ddir

    Returns
    -------
    ddf : DataFrame
        columns "CiteScore", "cumulative_count", " - diff = ", "count"
    """
    csv_path = os.path.join(ddir, d)
    raw = pd.read_csv(
        csv_path,
        header=None,
        names=["CiteScore", "cumulative_count"],
    )
    ordered = raw.sort_values(by="cumulative_count", ascending=True)
    ddf = ordered.reset_index(drop=True)

    # Running total of the previous row; 0 stands in before the first row.
    # Wrapped in a Series so the assignment aligns on the fresh 0..n-1 index.
    preceding = [0] + list(ddf["cumulative_count"])[:-1]
    ddf[" - diff = "] = pd.Series(preceding)
    ddf["count"] = ddf["cumulative_count"] - ddf[" - diff = "]
    return ddf


# Read the four cumulative-count tables from ddir, in the same order as the
# names on the left-hand side.
abide, no_abide, hcp, no_hcp = map(
    dfile,
    (
        "Expanded_ABIDE_CDF_Jon.csv",
        "Expanded_NON_ABIDE_CDF_Jon.csv",
        "HCP_Inv_Jon.csv",
        "HCP_No_Inv_Jon.csv",
    ),
)


def cdf_weighted(data, column, value):
    """
    Mean CiteScore of the journals behind a filtered set of rows.

    Rows of `data` where `column` equals `value` are selected, each row's
    "Journal" entry is lower-cased and looked up in the mapping returned by
    ``mg.get_journal_attrs()``, and the CiteScores found are averaged.
    A journal contributes once per selected row, so the mean is implicitly
    weighted by how many selected rows name that journal. Rows whose journal
    is absent from the mapping, or whose CiteScore entry is empty, are
    skipped.

    Parameters
    ----------
    data : DataFrame
        needs "Journal" column

    column : string
        column to filter

    value : any dtype
        value in `column` to get value for

    Returns
    -------
    cdfw : float
        ∑CiteScore/n rounded to 2 decimal places; NaN when no selected row
        has a usable CiteScore
    """
    journal_attrs = mg.get_journal_attrs()
    selected_journals = data.loc[data[column] == value, "Journal"]
    journal_keys = [str(journal_name).lower() for journal_name in selected_journals]
    # len() rather than truthiness: CiteScore entries are assumed to be
    # strings, with "" meaning "no score" -- TODO confirm against
    # mg.get_journal_attrs().
    citescores = [
        float(journal_attrs[key]["CiteScore"])
        for key in journal_keys
        if key in journal_attrs and len(journal_attrs[key]["CiteScore"])
    ]
    if not citescores:
        # np.mean([]) would also give NaN, but only via a "Mean of empty
        # slice" RuntimeWarning; return it explicitly so an empty selection
        # is a quiet, documented outcome.
        return float("nan")
    return float("%.2f" % round(np.mean(citescores), 2))


def cdf_weighted_from_count(data):
    """
    Count-weighted mean of the "CiteScore" column.

    Each CiteScore is multiplied by its row's "count"; the products are
    summed and divided by the total count, and the quotient is rounded to
    two decimal places.

    Parameter
    ---------
    data : DataFrame
        needs "CiteScore" and "count" columns

    Returns
    -------
    cdfw : float
        ∑(count × CiteScore) / ∑count
    """
    counts = data["count"]
    weighted_total = np.sum(counts * data["CiteScore"])
    total_count = np.sum(counts)
    mean_score = round(weighted_total / total_count, 2)
    return float("%.2f" % mean_score)

In [65]:
# Display label -> count table for each group being compared.
td = dict([
    ("ABIDE data use", abide),
    ("Non-ABIDE data use", no_abide),
    ("HCP contributors", hcp),
    ("HCP non-contributors", no_hcp),
])

# One row per group holding its count-weighted mean CiteScore; the single
# unnamed column produced by from_dict is labelled "weighted".
df = pd.DataFrame.from_dict(
    {label: cdf_weighted_from_count(table) for label, table in td.items()},
    orient="index",
).rename(columns={0: "weighted"})

df

Unnamed: 0,weighted
Non-ABIDE data use,4.24
ABIDE data use,4.59
HCP contributors,6.84
HCP non-contributors,4.59


# Contributor:

In [3]:
# NOTE(review): `existing` is not defined in the visible cells -- confirm it
# is created before this cell when running on a fresh kernel.
cdf_weighted(data=existing, column="Contributor", value="Contributor")

5.45

# Non-contributor:

In [4]:
# NOTE(review): `existing` is not defined in the visible cells -- confirm it
# is created before this cell when running on a fresh kernel.
cdf_weighted(data=existing, column="Contributor", value="Not a Contributor")

4.96

# ABIDE data use:

In [5]:
# Rows whose "Sets" text mentions ABIDE, restricted to "Data Use" == "Y".
# NOTE(review): `withsets` is not defined in the visible cells -- confirm it
# is created before this cell when running on a fresh kernel.
cdf_weighted(
    data=withsets.loc[withsets["Sets"].str.contains("ABIDE")],
    column="Data Use",
    value="Y",
)

4.82

# Non-ABIDE data use:

In [6]:
# Rows whose "Sets" text does not mention ABIDE, restricted to
# "Data Use" == "Y".
# NOTE(review): `withsets` is not defined in the visible cells -- confirm it
# is created before this cell when running on a fresh kernel.
cdf_weighted(
    data=withsets.loc[~withsets["Sets"].str.contains("ABIDE")],
    column="Data Use",
    value="Y",
)

4.6