# working with strings in pandas

In [4]:
# import libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# warning
import warnings
warnings.filterwarnings('ignore')

# Load the two raw CSV files (paths are relative to the working directory)
happiness_2015 = pd.read_csv("World_Happiness_2015.csv")
world_dev = pd.read_csv("World_dev.csv")

# Keep only the columns this notebook works with
happiness_cols = ["Country", "Happiness Rank", "Happiness Score"]
world_dev_cols = [
    "ShortName",
    "Region",
    "IncomeGroup",
    "CurrencyUnit",
    "SourceOfMostRecentIncomeAndExpenditureData",
    "SpecialNotes",
]
happiness_2015 = happiness_2015[happiness_cols]
world_dev = world_dev[world_dev_cols]

# Left join on the country name: every happiness row is kept, and countries
# with no matching ShortName get NaN in the world_dev columns
merged = happiness_2015.merge(
    world_dev,
    how="left",
    left_on="Country",
    right_on="ShortName",
)

# Shorten the long survey column name
maping = {"SourceOfMostRecentIncomeAndExpenditureData": "IESurvey"}
merged = merged.rename(columns=maping)

merged.tail(7)

Unnamed: 0,Country,Happiness Rank,Happiness Score,ShortName,Region,IncomeGroup,CurrencyUnit,IESurvey,SpecialNotes
151,Burkina Faso,152,3.587,Burkina Faso,Sub-Saharan Africa,Low income,West African CFA franc,Core Welfare Indicator Questionnaire Survey (C...,
152,Afghanistan,153,3.575,Afghanistan,South Asia,Low income,Afghan afghani,"Integrated household survey (IHS), 2008",Fiscal year end: March 20; reporting period fo...
153,Rwanda,154,3.465,Rwanda,Sub-Saharan Africa,Low income,Rwandan franc,"Integrated household survey (IHS), 2010/11","Based on official government statistics, natio..."
154,Benin,155,3.34,Benin,Sub-Saharan Africa,Low income,West African CFA franc,Core Welfare Indicator Questionnaire Survey (C...,
155,Syria,156,3.006,,,,,,
156,Burundi,157,2.905,Burundi,Sub-Saharan Africa,Low income,Burundi franc,Core Welfare Indicator Questionnaire Survey (C...,
157,Togo,158,2.839,Togo,Sub-Saharan Africa,Low income,West African CFA franc,Core Welfare Indicator Questionnaire Survey (C...,"April 2013 database update: Based on IMF data,..."


In [5]:
#1
# Transforming strings element by element with Series.apply
merged["CurrencyUnit"]

# Goal: keep only the name of the currency, i.e. the last word of the unit.
# Try the idea on a single string first.
words = "Egyptian Pound"
last_word = words.split()[-1]
last_word

def extract_curr(element):
    """Return the last whitespace-separated word of a currency unit.

    Parameters
    ----------
    element : object
        A single value of the ``CurrencyUnit`` column. Non-null values are
        converted with ``str`` before being split.

    Returns
    -------
    str or float
        The last word of ``str(element)``, or ``np.nan`` when ``element`` is
        missing or contains no words.
    """
    # Missing values must stay missing: str(NaN) is "nan", which would
    # otherwise be returned as if it were a currency name.
    if pd.isnull(element):
        return np.nan

    words = str(element).split()
    # An empty or whitespace-only string has no last word; indexing [-1]
    # on the empty list would raise IndexError.
    return words[-1] if words else np.nan

# Run the helper on every currency unit and keep the result as a new column
currency_unit = merged["CurrencyUnit"]
merged["Currency Apply"] = currency_unit.apply(extract_curr)

merged.head(5)

Unnamed: 0,Country,Happiness Rank,Happiness Score,ShortName,Region,IncomeGroup,CurrencyUnit,IESurvey,SpecialNotes,Currency Apply
0,Switzerland,1,7.587,Switzerland,Europe & Central Asia,High income: OECD,Swiss franc,"Expenditure survey/budget survey (ES/BS), 2004",,franc
1,Iceland,2,7.561,Iceland,Europe & Central Asia,High income: OECD,Iceland krona,"Integrated household survey (IHS), 2010",,krona
2,Denmark,3,7.527,Denmark,Europe & Central Asia,High income: OECD,Danish krone,"Income tax registers (ITR), 2010",,krone
3,Norway,4,7.522,Norway,Europe & Central Asia,High income: OECD,Norwegian krone,"Income survey (IS), 2010",,krone
4,Canada,5,7.427,Canada,North America,High income: OECD,Canadian dollar,"Labor force survey (LFS), 2010",Fiscal year end: March 31; reporting period fo...,dollar


In [12]:
#2-3
# Another way to split strings: the vectorized string methods under Series.str
currency_unit = merged["CurrencyUnit"]

currency_unit.str.split()   # list of words for each row
currency_unit.str[:5]       # first five characters of each row

# .str methods can be chained one after another
currency_unit.str.upper().str.split()

# Split the currency column and keep the last word of each row
merged["Currency Vectorized"] = currency_unit.str.split().str[-1]

merged.head()

# Compare the apply-based and the vectorized result side by side
merged[["Currency Apply", "Currency Vectorized"]]

Unnamed: 0,Currency Apply,Currency Vectorized
0,franc,franc
1,krona,krona
2,krone,krone
3,krone,krone
4,dollar,dollar
...,...,...
153,franc,franc
154,franc,franc
155,,
156,franc,franc


In [20]:
#4- exploring missing values

# Count the missing values in the currency column
currency_unit = merged["CurrencyUnit"]
currency_unit.isna().sum()

# Length of the string form of a value
def str_length(element):
    """Return the number of characters in ``str(element)``.

    Missing values get no special treatment: a float NaN is measured as the
    text ``"nan"`` and therefore has length 3.
    """
    text = str(element)
    return len(text)

currency_unit = merged["CurrencyUnit"]
length_apply = currency_unit.apply(str_length)

# Distribution of the computed lengths
length_apply.value_counts()

# Confirm that the function returns a length of 3 for the text "NaN"
str_length("NaN")


# Variant of the length function that leaves null values out
def compute_length(element):
    """Return the length of ``str(element)``, or ``None`` for a null value.

    A value is treated as null when ``pd.isnull(element)`` is true.
    """
    if pd.isnull(element):
        return None
    return len(str(element))
    
currency_unit = merged["CurrencyUnit"]

# Lengths computed with the null-aware helper
A = currency_unit.apply(compute_length)
A.value_counts(dropna=False)

# instructions: the same computation with the vectorized string method
lengths = currency_unit.str.len()
lengths.value_counts(dropna=False)

# The counts still contain a NaN entry, so Series.str.len() leaves missing
# values as NaN instead of measuring them

14.0    21
4.0     20
12.0    17
13.0    14
NaN     13
15.0    13
16.0    12
17.0     9
18.0     9
11.0     8
22.0     7
25.0     5
19.0     3
9.0      2
20.0     1
23.0     1
10.0     1
26.0     1
39.0     1
Name: CurrencyUnit, dtype: int64

In [21]:
# 5- find a specific word in strings
special_notes = merged["SpecialNotes"]
special_notes.iloc[153]  # what about the words "national accounts"?

# First regular expression: the character class [Nn] accepts either case
# for the first letter
pattern = r"[Nn]ational accounts"

# Flag the rows whose note contains the pattern; missing notes stay NaN
national_accounts = special_notes.str.contains(pattern)
national_accounts.value_counts(dropna=False)

# 6 continued
# Extract the rows which contain the pattern.
# merged[national_accounts] does not work while the mask still holds NaN,
# so map missing notes to False with na=False
national_accounts = special_notes.str.contains(pattern, na=False)
national_accounts.value_counts()

merged_national_accounts = merged.loc[national_accounts]

merged_national_accounts.head()

Unnamed: 0,Country,Happiness Rank,Happiness Score,ShortName,Region,IncomeGroup,CurrencyUnit,IESurvey,SpecialNotes,Currency Apply,Currency Vectorized
4,Canada,5,7.427,Canada,North America,High income: OECD,Canadian dollar,"Labor force survey (LFS), 2010",Fiscal year end: March 31; reporting period fo...,dollar,dollar
7,Sweden,8,7.364,Sweden,Europe & Central Asia,High income: OECD,Swedish krona,"Income survey (IS), 2005",Fiscal year end: June 30; reporting period for...,krona,krona
8,New Zealand,9,7.286,New Zealand,East Asia & Pacific,High income: OECD,New Zealand dollar,,Fiscal year end: March 31; reporting period fo...,dollar,dollar
9,Australia,10,7.284,Australia,East Asia & Pacific,High income: OECD,Australian dollar,"Expenditure survey/budget survey (ES/BS), 2003",Fiscal year end: June 30; reporting period for...,dollar,dollar
14,United States,15,7.119,United States,North America,High income: OECD,U.S. dollar,"Labor force survey (LFS), 2010",Fiscal year end: September 30; reporting perio...,dollar,dollar


In [27]:
# 7 extract a substring from a series
# Character-class building blocks for regular expressions
p = r"[0-9]" # any single digit
p = r"[a-z]" # any single lower-case letter
p = r"[A-Z]" # any single upper-case letter

# Classes can be placed side by side: a digit from 1 to 6, then two lower-case letters
pattern = r"[1-6][a-z][a-z]"

# {n} repeats the preceding class n times:
# r"[1-6][a-z][a-z][a-z]" is the same as r"[1-6][a-z]{3}"

# A four-digit year starting with 1 or 2; the parentheses form the capture
# group whose content str.extract returns
pattern = r"([1-2][0-9]{3})"

# Extract years from SpecialNotes
special_notes = merged["SpecialNotes"]
years = special_notes.str.extract(pattern)

# Inspect which rows contain a year in the SpecialNotes column
years

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
153,2006
154,
155,
156,


In [176]:
# 8- set expand=True to get a DataFrame back
# The first character class must be [1-2] (a digit 1 or 2). The earlier
# "[-2]" matched a literal "-" or "2", so it skipped years starting with 1
# and could capture text such as "-123".
pattern = r"([1-2][0-9]{3})"
years = merged["SpecialNotes"].str.extract(pattern, expand=True)  # expand=True is the default in current pandas
years


Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
153,2006
154,
155,
156,


In [31]:
# 9- extract all matches of a pattern from a series

# Use the country as the index so every match is labelled with its country.
# The guard keeps the cell re-runnable: after the first run "Country" is the
# index and no longer a column, so calling set_index again would raise KeyError.
if merged.index.name != "Country":
    merged = merged.set_index("Country")

# Named capture group: the group name becomes the column name of the result
pattern = r"(?P<years>[1-2][0-9]{3})"

# Extract every match of the pattern in the column, one row per match
merged["SpecialNotes"].str.extractall(pattern)

Unnamed: 0_level_0,Unnamed: 1_level_0,years
Country,match,Unnamed: 2_level_1
Finland,0,1999
Finland,1,1999
Netherlands,0,1999
Netherlands,1,2037
Netherlands,2,1999
...,...,...
Rwanda,1,2011
Rwanda,2,2008
Togo,0,2013
Togo,1,2000


In [33]:
#9- continued, on the column that contains different year formats such as 2005/2006

# Named group for a four-digit year starting with 1 or 2
pattern = r"(?P<years>[1-2][0-9]{3})"

ie_survey = merged["IESurvey"]
years = ie_survey.str.extractall(pattern)

# How often each extracted year occurs
years.value_counts()


years
2012     33
2010     28
2011     22
2013     12
2009      8
2008      6
2005      6
2007      4
2004      3
1995      1
2006      1
2003      1
2002      1
2000      1
1999      1
1998      1
1992      1
dtype: int64

We can now extract every four-digit year, but values written in a two-year format (e.g. 2018/19) still yield only their first year.

In [44]:
#10 - extract more than one group of a pattern from a series
# First group: the four-digit year. Then an optional "/" and a second group
# holding the two digits that follow it.
pattern = r"(?P<First_year>[1-2][0-9]{3})/?(?P<second_year>[0-9]{2})"

ie_survey = merged["IESurvey"]
years = ie_survey.str.extractall(pattern)
years

Unnamed: 0_level_0,Unnamed: 1_level_0,First_year,second_year
Country,match,Unnamed: 2_level_1,Unnamed: 3_level_1
Nigeria,0,2009,10
Azerbaijan,0,2011,12
Pakistan,0,2010,11
Mozambique,0,2008,9
Albania,0,2011,12
Swaziland,0,2009,10
South Africa,0,2010,11
Zimbabwe,0,2011,12
India,0,2011,12
Nepal,0,2010,11


In [60]:
# instruction
# Take the first two digits of the first year and prepend them to the
# two-digit second year to obtain a full four-digit year
pattern = r"([1-2][0-9])"

first_two_year = years["First_year"].str.extract(pattern)

# Column 0 holds the single capture group; "+" concatenates the strings row by row
leading_digits = first_two_year[0]
years["secon_year"] = leading_digits + years["second_year"]

years;

Unnamed: 0_level_0,Unnamed: 1_level_0,First_year,second_year,secon_year
Country,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nigeria,0,2009,10,2010
Azerbaijan,0,2011,12,2012
Pakistan,0,2010,11,2011
Mozambique,0,2008,9,2009
Albania,0,2011,12,2012
Swaziland,0,2009,10,2010
South Africa,0,2010,11,2011
Zimbabwe,0,2011,12,2012
India,0,2011,12,2012
Nepal,0,2010,11,2011


Country       match
Nigeria       0        20
Azerbaijan    0        20
Pakistan      0        20
Mozambique    0        20
Albania       0        20
Swaziland     0        20
South Africa  0        20
Zimbabwe      0        20
India         0        20
Nepal         0        20
Ethiopia      0        20
Kenya         0        20
Botswana      0        20
Malawi        0        20
Angola        0        20
Mali          0        20
Uganda        0        20
Tanzania      0        20
Rwanda        0        20
Benin         0        20
Name: First_year, dtype: object