In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
import re
import csv
from scipy import stats, integrate
from scipy.stats import spearmanr
%matplotlib inline

In [2]:
# Load the raw ("dirty") CSV that the rest of this notebook cleans.
# The file is decoded as ISO-8859-1; one of the column headers used further down
# contains a '£' sign, so presumably the export is not UTF-8 — TODO confirm.
dirty = pd.read_csv('DataCleaning.csv', encoding = "ISO-8859-1")

In [4]:
# Keep only the four columns this assignment needs and give three of them
# shorter names. Selecting with a list of labels yields a new frame, so the
# raw `dirty` frame is left as loaded.
data = dirty[[
    'Publisher',
    'Journal title',
    'Article title',
    'COST (£) charged to Wellcome (inc VAT when charged)',
]].rename(columns={
    'Journal title': 'Journal',
    'Article title': 'Articles',
    'COST (£) charged to Wellcome (inc VAT when charged)': 'cost_per_article',
})

In [5]:
# Drop every row that has a missing value in any of the four kept columns, so the
# string- and number-handling cells below never receive NaN.
# Note: this re-binds `data`, so the version that still contains nulls is no longer available.
data = data.dropna()

In [56]:
# Build a lookup from the acronym of each multi-word journal title to the title itself.
def journal_acronym(series):
    """Map acronyms of journal titles to the titles they were derived from.

    Parameters
    ----------
    series : iterable of str
        Journal titles. Titles with fewer than three whitespace-separated
        words are ignored.

    Returns
    -------
    dict
        ``{acronym: title}``. The acronym is the upper-cased first character
        of every word except 'of', 'and', 'the' and '&' (compared
        case-insensitively). When several titles share an acronym, the title
        that appears last in ``series`` is the one kept.
    """
    skipped_words = ('of', 'and', 'the', '&')
    lookup = {}
    for title in series:
        words = title.split()
        # One- and two-word titles never get an entry.
        if len(words) <= 2:
            continue
        initials = [w[0].upper() for w in words if w.lower() not in skipped_words]
        # A later title with the same acronym overwrites the earlier one.
        lookup[''.join(initials)] = title
    return lookup

In [57]:
# Build the acronym -> title lookup from the Journal column (nulls were dropped above).
# journal_replace reads this module-level name, so this cell must run before it is called.
journal_dict = journal_acronym(data['Journal'])

In [58]:
def journal_replace(series, mapping=None):
    """Return ``series`` with multi-word titles swapped for their lookup value.

    For every string made of more than two whitespace-separated words, an
    acronym is built the same way ``journal_acronym`` builds its keys: the
    upper-cased first character of each word, ignoring 'of', 'and', 'the'
    and '&' (compared case-insensitively). If that acronym is a key of
    ``mapping``, the title is replaced by the mapped value; every other
    value is passed through unchanged.

    Parameters
    ----------
    series : pandas.Series
        Journal titles. Non-string entries (e.g. NaN) are left as they are.
    mapping : dict, optional
        Acronym -> title lookup. Defaults to the notebook-level
        ``journal_dict`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.Series
        A new series with the same index and name as ``series``. The input
        is not modified; assign the result back to keep it.
    """
    if mapping is None:
        # Resolved at call time, so journal_dict only has to exist when the
        # function is called, not when it is defined.
        mapping = journal_dict

    skipped_words = ('of', 'and', 'the', '&')

    def canonical(title):
        # Leave anything that is not text (NaN, numbers) untouched.
        if not isinstance(title, str):
            return title
        words = title.split()
        # One- and two-word titles are never rewritten.
        if len(words) <= 2:
            return title
        key = ''.join(w[0].upper() for w in words if w.lower() not in skipped_words)
        return mapping.get(key, title)

    # One pass over the values instead of a whole-series replace per element.
    return series.map(canonical)

In [59]:
# Overwrite the Journal column with the series returned by journal_replace, i.e. with
# multi-word titles swapped for the journal_dict entry that shares their acronym.
data['Journal'] = journal_replace(data['Journal'])

In [12]:
# Before answering the challenge questions, remove the currency markers from
# cost_per_article and store the column as floats.
#
# The argument to strip() is a set of characters, not a pattern: any leading or
# trailing '£', '$', '|' or space is removed. The column is assigned back
# explicitly (rather than updated through a column selection) so the change is
# guaranteed to land in `data` and the column ends up with a float dtype.
#
# NOTE(review): '£' and '$' amounts are both reduced to bare numbers with no
# currency conversion — confirm that is intended.
# NOTE(review): the per-journal means printed further down are far larger than
# the medians, which suggests a few extreme values in this column — confirm
# whether they are real costs or placeholders before trusting the means.
data['cost_per_article'] = (
    data['cost_per_article']
    .astype(str)
    .str.strip('£$| ')
    .astype(float)
)

Determine the five most common journals and the total articles for each. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal. You will need to do considerable data cleaning in order to extract accurate estimates. For a real bonus round, identify the open access prices paid by subject area.

In [41]:
# Count the articles recorded for each journal, order the journals from the
# highest count to the lowest, and show the first five.
(
    data.groupby('Journal')['Articles']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

Journal
PLoS One                     92
Jnl Biological Chemistry     71
PLoS ONE                     62
JOURNAL OF NEUROCHEMISTRY    34
Nucleic Acids Research\n     29
Name: Articles, dtype: int64

In [63]:
# Keep the rows whose Journal is exactly 'PLoS One', then print the mean, median and
# standard deviation of cost_per_article for those rows, one value per line.
PLoSOne = data[data['Journal'] == 'PLoS One']
print(
    PLoSOne['cost_per_article'].mean(),
    PLoSOne['cost_per_article'].median(),
    PLoSOne['cost_per_article'].std(),
    sep='\n',
)

24732.8433696
894.22
147540.721101


In [64]:
# Keep the rows whose Journal is exactly 'Jnl Biological Chemistry', then print the mean,
# median and standard deviation of cost_per_article for those rows, one value per line.
JnlBiologicalChemistry = data[data['Journal'] == 'Jnl Biological Chemistry']
print(
    JnlBiologicalChemistry['cost_per_article'].mean(),
    JnlBiologicalChemistry['cost_per_article'].median(),
    JnlBiologicalChemistry['cost_per_article'].std(),
    sep='\n',
)

29515.7440845
1324.57
166402.487895


In [65]:
# Keep the rows whose Journal is exactly 'PLoS ONE' (upper-case spelling), then print the mean,
# median and standard deviation of cost_per_article for those rows, one value per line.
PLoSONE = data[data['Journal'] == 'PLoS ONE']
print(
    PLoSONE['cost_per_article'].mean(),
    PLoSONE['cost_per_article'].median(),
    PLoSONE['cost_per_article'].std(),
    sep='\n',
)

49248.7172581
890.095
216138.48622


In [66]:
# Keep the rows whose Journal is exactly 'JOURNAL OF NEUROCHEMISTRY', then print the mean,
# median and standard deviation of cost_per_article for those rows, one value per line.
JOURNALofNEUROCHEMISTRY = data[data['Journal'] == 'JOURNAL OF NEUROCHEMISTRY']
print(
    JOURNALofNEUROCHEMISTRY['cost_per_article'].mean(),
    JOURNALofNEUROCHEMISTRY['cost_per_article'].median(),
    JOURNALofNEUROCHEMISTRY['cost_per_article'].std(),
    sep='\n',
)

31202.0941176
1909.19
171182.810111


In [67]:
# Keep the rows whose Journal is exactly 'Nucleic Acids Research' followed by a newline
# character (the spelling that appears in the top-five listing), then print the mean,
# median and standard deviation of cost_per_article for those rows, one value per line.
NucleicAcidsResearch = data[data['Journal'] == 'Nucleic Acids Research\n']
print(
    NucleicAcidsResearch['cost_per_article'].mean(),
    NucleicAcidsResearch['cost_per_article'].median(),
    NucleicAcidsResearch['cost_per_article'].std(),
    sep='\n',
)

1162.34482759
852.0
442.150933818
