In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display as dsp
from difflib import SequenceMatcher
from scipy.stats import mode #Returns the value of a set that occurs the most
#%matplotlib inline

#latin1, latin9, or win1252.
spend = pd.read_csv('data/WELLCOME_APCspend2013_forThinkful.csv', encoding = 'latin1')
spend.rename(index = str, columns = {"COST (£) charged to Wellcome (inc VAT when charged)" : "Cost"})
dsp(spend.head())

#Starting with publishers. Strip whitespace and make all lowercase.
spend['Publisher'] = spend['Publisher'].str.lower().str.strip()

#This is an attempt to make my life easier
def replace_matches(df, column, accuracy, printme):
    entries = df[column].unique()
    i = 0
    for entry in entries:
        i += 1
        
        for comp in entries:
            if SequenceMatcher(None, entry, comp).ratio() > accuracy and entry != comp: #biomed was replaced with pubmed
                
                if printme == True:
                    print('{} REPLACED WITH {}'.format(entry, comp))
                df[df[column] == entry] = comp
            
def return_acronym(my_string):
    words = my_string.split()
    letters = [word[0] for word in words]
    return "".join(letters)

def expand_acronyms(df, column, accuracy, printme):
    entries = df[column].unique()
    for entry in entries:
        if len(entry) < 10:
            for comp in entries:
                if SequenceMatcher(None, entry, return_acronym(comp)).ratio() > accuracy and entry != comp:
                    if printme == True:
                        print('{} REPLACED WITH {}'.format(entry, comp))
                    df[df[column] == entry] = comp
                
#I will either have to play with these numbers or implement manual double checking.
replace_matches(spend, 'Publisher', 0.81, False)
expand_acronyms(spend, 'Publisher', 0.85, False)
print('Clean-up complete!')

#Stat details:

print('Publisher with most published works:\n', mode(spend['Publisher']))

#Sanity check
#print('\n\n', spend['Publisher'].unique())

#"Elsevier" appears to be the most prolific publisher with 399 published works.


Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


Clean-up complete!
Publisher with most published works:
 ModeResult(mode=array(['elsevier'], dtype=object), count=array([399]))




In [87]:
#Deal with arrays :)
costs = spend['COST (£) charged to Wellcome (inc VAT when charged)']

#Extract elements of a string that can be turned into a float
def get_float(str):
    the_float = ''
    point_count = 0
    try:
        for ch in str:
            if ch.isdigit():
                the_float += ch
            if ch == '.' and point_count == 0: #Only include first instance of decimal, I want to change this later
                the_float += ch
                point_count += 1
        return float(the_float)
    except:
        print('This caused problems: ', str)
        
#print('Function test:')
#print(get_float('1.234.50')) #Make this return 1234.50

costs = costs.apply(lambda x: get_float(x))
print(costs)

This caused problems:  cambridge uni press
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  american chemical society
This caused problems:  acs (amercian chemical society) publications
This caused problems:  acs (amercian chemical society) publications
This caused problems:  acs (amercian chemical society) publications
This caused problems:  acs (amercian chemical society) publications
This caused problems:  acs (amercian chemical society) publications
This caused problems:  acs publications
This caused problems:  acs publications
This caused problems:  acs (amercian chemical society) publications
This caused problems:  american physiological society
This caused problems:  american physiological society
This caused proble