2 Data preparation and cleaning
======
***

2.1 Wikipedia data preparation and cleaning
------

In [152]:
import pandas as pd 
pd.set_option('display.max_rows', 100)

In [153]:
# Read data from file 
data = pd.read_csv("output_List_of_explorers_t0.tsv",sep='\t') 
# Preview the first 5 lines of the loaded data 
print(data.head())
print(data.shape)


                              Name Modern (and former) nationality Centuries  \
0                 Antonio de Abreu                      Portuguese      16th   
1                    William Adams                         English      17th   
2                     Diogo Afonso                      Portuguese      15th   
3  Sir Crispin Agnew of Lochnaw Bt                        Scottish      20th   
4                  Charles Albanel                          French      17th   

                                 Main areas explored  
0                                          Indonesia  
1                                              Japan  
2                            Northwest African coast  
3  Greenland, Elephant Island, Northern Patagonia...  
4                                             Canada  
(596, 4)


In [154]:
data.drop_duplicates()
data.shape

(596, 4)

In [155]:
print(data['Centuries'].value_counts(ascending=False))


19th            132
16th             87
20th             73
18th             55
15th             49
17th             47
19th/20th        31
15th/16th        28
18th/19th        14
20th/21st        13
16th/17th        10
21st              9
13th              6
9th               5
14th              5
10th              4
6th BC            4
11th              3
8th               3
4th BC            3
13th/14th         2
2nd BC            2
1st BC            2
17th/18th         2
21st/20th BC      1
6th               1
12th              1
15th BC           1
7th               1
3rd BC            1
5th BC            1
Name: Centuries, dtype: int64


In [156]:

natCountF = data['Modern (and former) nationality'].value_counts(ascending=False).rename_axis('nationality').reset_index(name='counts')
print(natCountF.head())
print(natCountF.shape)
#print(natCountF)

  nationality  counts
0  Portuguese     100
1     English      69
2     Spanish      57
3    American      51
4     Russian      36
(95, 2)


In [157]:
# rename columns
data.columns = ['name','nationality','century','area']
print(data.head())

                              name nationality century  \
0                 Antonio de Abreu  Portuguese    16th   
1                    William Adams     English    17th   
2                     Diogo Afonso  Portuguese    15th   
3  Sir Crispin Agnew of Lochnaw Bt    Scottish    20th   
4                  Charles Albanel      French    17th   

                                                area  
0                                          Indonesia  
1                                              Japan  
2                            Northwest African coast  
3  Greenland, Elephant Island, Northern Patagonia...  
4                                             Canada  


In [158]:
from pandas import Series
# copy dataframe to cleandata
cleandata = data.copy()
# split centuries by / to seperated rows 
century = cleandata['century'].str.split('/').apply(Series, 1).stack()
# line up with clean data index
century.index = century.index.droplevel(-1)
# set join column name
century.name = 'century'
print(century.head())

# remove original century column
del cleandata['century']
# join with century dataframe
cleandata = cleandata.join(century)
# reindex
cleandata=cleandata.reset_index(drop=True)
print(cleandata.head())
#print(cleandata)
print(cleandata.shape)

0    16th
1    17th
2    15th
3    20th
4    17th
Name: century, dtype: object
                              name nationality  \
0                 Antonio de Abreu  Portuguese   
1                    William Adams     English   
2                     Diogo Afonso  Portuguese   
3  Sir Crispin Agnew of Lochnaw Bt    Scottish   
4                  Charles Albanel      French   

                                                area century  
0                                          Indonesia    16th  
1                                              Japan    17th  
2                            Northwest African coast    15th  
3  Greenland, Elephant Island, Northern Patagonia...    20th  
4                                             Canada    17th  
(697, 4)


In [159]:
#print(cleandata)

In [160]:
# split nationalities by - to seperated rows, , also need to clean – , () and / [11]
nationality = cleandata['nationality'].str.split('-|–|,|/|and|\(|\)|\[11]| ').apply(Series, 1).stack()
# line up with clean data index
nationality.index = nationality.index.droplevel(-1)
# set join column name
nationality.name = 'nationality'
print(nationality.head())

# remove original nationality column
del cleandata['nationality']
# join with nationality dataframe
cleandata = cleandata.join(nationality)
# reindex
cleandata=cleandata.reset_index(drop=True)
print(cleandata.head())
print(cleandata.shape)

0    Portuguese
1       English
2    Portuguese
3      Scottish
4        French
Name: nationality, dtype: object
                              name  \
0                 Antonio de Abreu   
1                    William Adams   
2                     Diogo Afonso   
3  Sir Crispin Agnew of Lochnaw Bt   
4                  Charles Albanel   

                                                area century nationality  
0                                          Indonesia    16th  Portuguese  
1                                              Japan    17th     English  
2                            Northwest African coast    15th  Portuguese  
3  Greenland, Elephant Island, Northern Patagonia...    20th    Scottish  
4                                             Canada    17th      French  
(816, 4)


In [161]:
# trim empty space before and after content
cleandata['nationality'] = cleandata['nationality'].str.strip()
# remove roww with empty nationality
cleandata = cleandata[cleandata['nationality']!='']

# reindex
cleandata=cleandata.reset_index(drop=True)

In [164]:
#print(cleandata)
cleandata.drop_duplicates()
cleandata.shape

(787, 4)

In [163]:
cleandata.to_csv('{}.csv'.format("cleandata_explorers"),index = False, header=True)