In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
#importing data from csv file
data = pd.read_csv("WELLCOME_APCspend2013_forThinkful.csv",encoding='iso-8859-13')

In [3]:
data.head(5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [4]:
#Renaming the columns for readability
data.rename( columns={'PMID/PMCID': 'ID',
                      'Journal title': 'Journal', 
                      'Article title':'Title',
                      'COST (£) charged to Wellcome (inc VAT when charged)':'Cost'}, inplace=True)

In [5]:
data.head(5)

Unnamed: 0,ID,Publisher,Journal,Title,Cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [6]:
#counting the no of journals
data.Journal.value_counts()

PLoS One                                                         92
PLoS ONE                                                         62
Journal of Biological Chemistry                                  48
Nucleic Acids Research                                           21
Proceedings of the National Academy of Sciences                  19
Human Molecular Genetics                                         18
PLoS Neglected Tropical Diseases                                 18
Nature Communications                                            17
PLoS Genetics                                                    15
PLoS Pathogens                                                   15
Neuroimage                                                       15
BMC Public Health                                                14
NeuroImage                                                       14
PLOS ONE                                                         14
Brain                                           

In [7]:
#sorting to see how data looks
data.Journal.sort_values()

21                                   ACS Chemical Biology
20                                   ACS Chemical Biology
8                                    ACS Chemical Biology
9                                    ACS Chemical Biology
19                                   ACS Chemical Biology
22                              ACS Chemical Neuroscience
23                                               ACS NANO
34                                               ACS Nano
924                                                ACTA F
1808                                                  AGE
2125                                                 AIDS
2123                                                 AIDS
2115                                                 AIDS
1797                                           AIDS Behav
1811                                            AIDS Care
1824                                            AIDS Care
2116                                         AIDS Journal
235           

In [8]:
#Capitalizing Journals and Removing white spaces in beginnings and end 
data.Journal = (data.Journal.str.capitalize()).str.strip()

In [9]:
#Replacing number by words, d followed by comma to colon and double white space to single white space
data.Journal = data.Journal.replace({'1':'one','d,':'d:','  ':' '},regex=True)

In [10]:
#Top 5 common Journals with total articles.
data.Journal.value_counts().head(5)

Plos one                           198
Journal of biological chemistry     53
Neuroimage                          29
Nucleic acids research              26
Plos genetics                       24
Name: Journal, dtype: int64

In [11]:
#remove pound and dollar signs from cost column
data.Cost = data.Cost.replace({'£': '', '\$': ''}, regex=True)

In [12]:
#convert cost column from string to float
data.Cost = pd.to_numeric(data.Cost,downcast='float')

In [13]:
data.head(5)

Unnamed: 0,ID,Publisher,Journal,Title,Cost
0,,CUP,Psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,2381.040039
2,23043264 PMC3506128,ACS,J med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.559998
3,23438330 PMC3646402,ACS,J med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.640015
4,23438216 PMC3601604,ACS,J org chem,Regioselective opening of myo-inositol orthoes...,685.880005


In [14]:
#Check the cost column for value discrepancies
data.Cost.sort_values(ascending = False).head(5)

1564    999999.0
996     999999.0
560     999999.0
1565    999999.0
1309    999999.0
Name: Cost, dtype: float32

In [15]:
#cost of 999999.00 doesnt make sense, looks like it was entered twice so set it to 999
data.Cost = data.apply(lambda x: 999 if x.Cost >= 999999 else x.Cost, axis=1)

In [16]:
#Again checking for incorrect values 
data.Cost.sort_values(ascending = False).head(5)

1987    201024.0
1470    192645.0
986      13200.0
1619      6000.0
800       5760.0
Name: Cost, dtype: float64

In [17]:
#Looks like anything above 10000 was error in decimal point so correct it iteratively
data.Cost = data.apply(lambda x: (x.Cost/10) if x.Cost >= 10000 else x.Cost, axis=1)

In [18]:
#Again checking for incorrect values
data.Cost.sort_values(ascending = False).head(5)

1987    20102.4
1470    19264.5
1619     6000.0
800      5760.0
798      4800.0
Name: Cost, dtype: float64

In [19]:
#Correcting the decimal points again
data.Cost = data.apply(lambda x: (x.Cost/10) if x.Cost >= 10000 else x.Cost, axis=1)

In [20]:
#Now the data looks believable
data.Cost.sort_values(ascending = False).head(5)

1619    6000.0
800     5760.0
552     4800.0
798     4800.0
648     4800.0
Name: Cost, dtype: float64

In [21]:
#Plos one mean, median and standard deviation
plosdata = data[data.Journal == 'Plos one']
print('Mean, Meadian and Standard Deviation for Journal "Plos one" are:'
      , plosdata.Cost.mean(),',',plosdata.Cost.median(),'and',plosdata.Cost.std())

Mean, Meadian and Standard Deviation for Journal "Plos one" are: 944.9030299485332 , 901.8099975585938 and 207.01745622056202


In [22]:
#Journal of biological chemistry mean, median and standard deviation
jobcdata = data[data.Journal == 'Journal of biological chemistry']
print('Mean, Meadian and Standard Deviation for Journal "Journal of biological chemistry" are:'
      , jobcdata.Cost.mean(),',',jobcdata.Cost.median(),'and',jobcdata.Cost.std())

Mean, Meadian and Standard Deviation for Journal "Journal of biological chemistry" are: 1415.5773539993 , 1287.75 and 412.1216238854899


In [23]:
#Neuroimage mean, median and standard deviation
neurodata = data[data.Journal == 'Neuroimage']
print('Mean, Meadian and Standard Deviation for Journal "Neuroimage" are:'
      , neurodata.Cost.mean(),',',neurodata.Cost.median(),'and',neurodata.Cost.std())

Mean, Meadian and Standard Deviation for Journal "Neuroimage" are: 2215.168280239763 , 2326.429931640625 and 266.6539583463401


In [24]:
#Nucleic acids research mean, median and standard deviation
nucleicdata = data[data.Journal == 'Nucleic acids research']
print('Mean, Meadian and Standard Deviation for Journal "Nucleic acids research" are:'
      , nucleicdata.Cost.mean(),',',nucleicdata.Cost.median(),'and',nucleicdata.Cost.std())

Mean, Meadian and Standard Deviation for Journal "Nucleic acids research" are: 1149.0 , 852.0 and 442.9404474644419


In [25]:
#Plos genetics mean, median and standard deviation
plosgendata = data[data.Journal == 'Plos genetics']
print('Mean, Meadian and Standard Deviation for Journal "Plos genetics" are:'
      , plosgendata.Cost.mean(),',',plosgendata.Cost.median(),'and',plosgendata.Cost.std())

Mean, Meadian and Standard Deviation for Journal "Plos genetics" are: 1589.4349975585938 , 1696.6699829101562 and 233.55106590828112
