# Data cleaning: All methods

In [114]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

### (Not so) Brief look at the data.



In [115]:
# Location of the British Library book metadata CSV.
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere.
DATA_PATH = "C:/Users/cirob/Desktop/UNI MASTER/Data Cleaning/BL-Flickr-Images-Book.csv"

# Load the raw data and preview the first five rows (the default for head()).
book_bl = pd.read_csv(DATA_PATH)

book_bl.head()

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [116]:
# Dimensions of the frame as a (rows, columns) tuple
print(f"{book_bl.shape}")

(8287, 15)


In [117]:
# Data type pandas inferred for each column
print(str(book_bl.dtypes))

Identifier                  int64
Edition Statement          object
Place of Publication       object
Date of Publication        object
Publisher                  object
Title                      object
Author                     object
Contributors               object
Corporate Author          float64
Corporate Contributors    float64
Former owner               object
Engraver                  float64
Issuance type              object
Flickr URL                 object
Shelfmarks                 object
dtype: object


In [118]:
# Keep only the columns with a numeric dtype and list their names
book_bl_numeric = book_bl.select_dtypes(include="number")
numeric_cols = book_bl_numeric.columns.to_numpy()
print(numeric_cols)

['Identifier' 'Corporate Author' 'Corporate Contributors' 'Engraver']


In [119]:
# Keep every column whose dtype is not numeric and list their names
book_bl_non_numeric = book_bl.select_dtypes(exclude="number")
non_numeric_cols = book_bl_non_numeric.columns.to_numpy()
print(non_numeric_cols)

['Edition Statement' 'Place of Publication' 'Date of Publication'
 'Publisher' 'Title' 'Author' 'Contributors' 'Former owner'
 'Issuance type' 'Flickr URL' 'Shelfmarks']


In [120]:
# Column labels of the dataset as a plain Python list
book_bl_columns = book_bl.columns.tolist()
book_bl_columns

['Identifier',
 'Edition Statement',
 'Place of Publication',
 'Date of Publication',
 'Publisher',
 'Title',
 'Author',
 'Contributors',
 'Corporate Author',
 'Corporate Contributors',
 'Former owner',
 'Engraver',
 'Issuance type',
 'Flickr URL',
 'Shelfmarks']

### Convert the data to the same format

In [121]:
# Preview the first ten raw publication dates before any cleaning
book_bl['Date of Publication'].head(10)

0    1879 [1878]
1           1868
2           1869
3           1851
4           1857
5           1875
6           1872
7            NaN
8           1676
9           1679
Name: Date of Publication, dtype: object

In [122]:
# Pattern for the publication year:
#   ^       anchors the match at the start of the string
#   \d{4}   matches exactly four digits
#   ( ... ) is the capturing group, i.e. the part that .str.extract returns
regex = r'^(\d{4})'

# Replace each date string by its captured leading year; expand=False asks for
# a Series instead of a one-column DataFrame.
book_bl['Date of Publication'] = (
    book_bl['Date of Publication']
    .str.extract(regex, expand=False)
)
book_bl['Date of Publication'].head(10)

0    1879
1    1868
2    1869
3    1851
4    1857
5    1875
6    1872
7     NaN
8    1676
9    1679
Name: Date of Publication, dtype: object

In [123]:
# Convert it to float (optional)
#book_bl['Date of Publication'] = pd.to_numeric(book_bl['Date of Publication'])
#book_bl['Date of Publication'].dtype

### Missing Values
There are a few ways to deal with missing values:

- Drop the column completely. If the column isn’t that important to your analysis, just drop it.
- Imputation — the process of replacing missing data with substituted values. For string-type values, we can replace NaN values with “”, “None”, or any other string that signals that the entry has no value. When the feature is a categorical variable, we may impute the missing data with the mode (the most frequent value).


In [124]:
# Fraction of missing entries per column (0.0 = none missing, 1.0 = all missing)
missing_share = book_bl.isna().mean()
print("Missing values distribution: ", missing_share, "", sep="\n")

Missing values distribution: 
Identifier                0.000000
Edition Statement         0.906721
Place of Publication      0.000000
Date of Publication       0.117171
Publisher                 0.506215
Title                     0.000000
Author                    0.214553
Contributors              0.000000
Corporate Author          1.000000
Corporate Contributors    1.000000
Former owner              0.999879
Engraver                  1.000000
Issuance type             0.000000
Flickr URL                0.000000
Shelfmarks                0.000000
dtype: float64



In [125]:
# Share of missing entries per column, printed as a whole-number percentage
for col in book_bl.columns:
    pct_missing = book_bl[col].isna().mean()
    print(f'{col} - {round(pct_missing*100)}%')

Identifier - 0%
Edition Statement - 91%
Place of Publication - 0%
Date of Publication - 12%
Publisher - 51%
Title - 0%
Author - 21%
Contributors - 0%
Corporate Author - 100%
Corporate Contributors - 100%
Former owner - 100%
Engraver - 100%
Issuance type - 0%
Flickr URL - 0%
Shelfmarks - 0%


In [126]:
# Drop the columns with a lot of missing values (each of these is at least
# half empty according to the missing-value percentages printed earlier).
to_drop = ['Edition Statement',
           'Publisher',
           'Corporate Author',
           'Corporate Contributors',
           'Former owner',
           'Engraver']

# Reassign the result instead of dropping in place, and ignore labels that are
# already gone, so that re-running this cell does not raise a KeyError.
book_bl = book_bl.drop(columns=to_drop, errors='ignore')

# Check it: the remaining columns and their share of missing entries
for col in book_bl.columns:
    pct_missing = np.mean(book_bl[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

Identifier - 0%
Place of Publication - 0%
Date of Publication - 12%
Title - 0%
Author - 21%
Contributors - 0%
Issuance type - 0%
Flickr URL - 0%
Shelfmarks - 0%


In [127]:
# Columns whose missing entries are replaced by an empty string
to_impute = ['Date of Publication', 'Author']

# Fill both columns in a single assignment
book_bl[to_impute] = book_bl[to_impute].fillna("")

# Check it: share of missing entries per column after imputation
for col in book_bl.columns:
    pct_missing = book_bl[col].isna().mean()
    print(f'{col} - {round(pct_missing*100)}%')

Identifier - 0%
Place of Publication - 0%
Date of Publication - 0%
Title - 0%
Author - 0%
Contributors - 0%
Issuance type - 0%
Flickr URL - 0%
Shelfmarks - 0%


### Unique values of columns
Beyond missing values, a column may also contain corrupted or inconsistent values that you only run into once you start the analysis. Nonetheless, it might not be practical to check the unique values of every column.


In [128]:
# Distinct values found in the place-of-publication column
pd.unique(book_bl['Place of Publication'])

array(['London', 'London; Virtue & Yorston',
       'pp. 40. G. Bryan & Co: Oxford, 1898', ...,
       'pp. viii. 64. J. Debrett: London, 1789', 'G. Eld: London, 1608',
       'Newcastle upon Tyne'], dtype=object)

In [133]:
# More details on these values: places of publication that carry extra
# information (publisher names, page counts, years) besides the place itself.
incorrect_values = [
    'London; Virtue & Yorston',
    'pp. 40. G. Bryan & Co: Oxford, 1898',
    'G. Eld: London, 1608',
    'pp. viii. 64. J. Debrett: London, 1789',
]

# Select the matching rows with one vectorized membership test instead of
# looking up every row by position, then print each match followed by a
# blank line.
suspect_rows = book_bl[book_bl['Place of Publication'].isin(incorrect_values)]
for _, row in suspect_rows.iterrows():
    print(row)
    print("")

Identifier                                                            216
Place of Publication                             London; Virtue & Yorston
Date of Publication                                                  1868
Title                   All for Greed. [A novel. The dedication signed...
Author                                                          A., A. A.
Contributors                 BLAZE DE BURY, Marie Pauline Rose - Baroness
Issuance type                                                 monographic
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Shelfmarks                              British Library HMNTS 12626.cc.2.
Name: 1, dtype: object

Identifier                                                            667
Place of Publication                  pp. 40. G. Bryan & Co: Oxford, 1898
Date of Publication                                                      
Title                   The Coming of Spring, and other poems. By J. A...
Author        

In [129]:
# Distinct values found in the issuance-type column
pd.unique(book_bl['Issuance type'])

array(['monographic', 'continuing'], dtype=object)