In [96]:
import numpy as np
import pandas as pd
import scipy.stats

In [97]:
# Read the raw export. Several encodings were tried; latin1 is the one that worked.
df = pd.read_csv("TH_Cleaning.csv", encoding='latin1')

# Give the very long cost header a short, usable name.
df = df.rename(columns={'COST (£) charged to Wellcome (inc VAT when charged)': 'Cost'})

# Normalise every column: trim leading/trailing whitespace and lower-case the
# text so that values differing only in case or padding compare equal.
# To see where NaNs occur, inspect df[name].isna().value_counts() inside the loop.
for name in list(df.columns):
    df[name] = df[name].str.strip().str.lower()

# Discard incomplete rows: about 10% of the data (roughly 200 rows), all of
# them missing a value in the PMID column.
df.dropna(inplace=True)

# Strip currency symbols from the cost. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the symbols in place and make every cost fail the numeric conversion below.
df['Cost'] = df['Cost'].str.replace(r'[£$]+', "", regex=True)
# Drop the decimal point and up to two following digits (plus any currency
# symbol touching them) so only the whole-number amount remains.
df['Cost'] = df['Cost'].str.replace(r'[$£]?[.][0-9]{0,2}[$£]*', "", regex=True)
# Remove all whitespace from the identifiers (random \n's were showing up).
# Raw string avoids the invalid "\s" escape-sequence warning.
df['PMID/PMCID'] = df['PMID/PMCID'].str.replace(r'\s+', "", regex=True)

# Convert to integers; anything that still fails to parse is coerced to NaN and
# then filled with 0.
df['Cost'] = pd.to_numeric(df['Cost'], errors='coerce').fillna(0).astype(np.int64)

# Winsorize 3% at each tail; there were values such as 99999 and some < 100.
# The result is assigned back explicitly instead of using inplace=True on the
# Series, which only works while pandas hands out a writable view of its data.
df['Cost'] = np.asarray(
    scipy.stats.mstats.winsorize(df['Cost'].to_numpy(), limits=(0.03, 0.03))
)

# this was to get an idea of the distribution of costs, and where outliers might be
#values, counts = np.unique(df['Cost'], return_counts=True)

#df.groupby('Journal title')['Cost'].describe()

# Grab the 5 journals with the most articles. Series.value_counts() replaces
# the deprecated top-level pd.value_counts().
popular = list(df['Journal title'].value_counts().head(5).index)

# Calculate and display the mean, median, and sample standard deviation
# (ddof=1) of the cost for each of the 5 most popular journals.
for title in popular:
    costs = df.loc[df['Journal title'] == title, 'Cost']
    label = title.title()  # titles were lower-cased during cleaning; restore Title Case for display
    print(f"{label} mean: £{costs.mean():.0f}")
    print(f"{label} median: £{costs.median():.0f}")
    print(f"{label} std: £{costs.std(ddof=1):.0f}\n")

Plos One mean: £1079
Plos One median: £900
Plos One std: £656

Journal Of Biological Chemistry mean: £1484
Journal Of Biological Chemistry median: £1328
Journal Of Biological Chemistry std: £538

Neuroimage mean: £2230
Neuroimage median: £2334
Neuroimage std: £258

Nucleic Acids Research mean: £1161
Nucleic Acids Research median: £852
Nucleic Acids Research std: £448

Plos Pathogens mean: £1781
Plos Pathogens median: £1600
Plos Pathogens std: £723

