# Pandas

In [3]:
import pandas as pd

In [5]:
df = pd.read_csv("gapminder.tsv", sep='\t')
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [4]:
df.shape

(1704, 6)

In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [7]:
df.dtypes # dtype for each column

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [8]:
df.info # shows each and every row and column

<bound method DataFrame.info of           country continent  year  lifeExp       pop   gdpPercap
0     Afghanistan      Asia  1952   28.801   8425333  779.445314
1     Afghanistan      Asia  1957   30.332   9240934  820.853030
2     Afghanistan      Asia  1962   31.997  10267083  853.100710
3     Afghanistan      Asia  1967   34.020  11537966  836.197138
4     Afghanistan      Asia  1972   36.088  13079460  739.981106
...           ...       ...   ...      ...       ...         ...
1699     Zimbabwe    Africa  1987   62.351   9216418  706.157306
1700     Zimbabwe    Africa  1992   60.377  10704340  693.420786
1701     Zimbabwe    Africa  1997   46.809  11404948  792.449960
1702     Zimbabwe    Africa  2002   39.989  11926563  672.038623
1703     Zimbabwe    Africa  2007   43.487  12311143  469.709298

[1704 rows x 6 columns]>

In [10]:
df.head() # shows top 5 if number not specified

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


## subsetting

### subsetting column by name

In [11]:
country_df = df['country']
country_df

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [42]:
country_df.head() # DEFAULT VALUE IS 5

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [14]:
# country_df.head(n=1)
country_df.head(n=10) # country_df.head(10) same

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
5    Afghanistan
6    Afghanistan
7    Afghanistan
8    Afghanistan
9    Afghanistan
Name: country, dtype: object

In [15]:
country_df.tail()

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [16]:
country_df.tail(10)

1694    Zimbabwe
1695    Zimbabwe
1696    Zimbabwe
1697    Zimbabwe
1698    Zimbabwe
1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [17]:
country_df.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti',
       'Honduras', 'Hong Kong, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea, Dem. Rep.',
       'Korea, Rep.', 'Kuwait', 'Leba

If we want only a specific column from our data, we can access the data
using square brackets.

country_df = df['country'] # returns a dataframe

To specify multiple columns by the column name, we need to pass in a
Python list between the square brackets. This may look a bit strange since
there will be two sets of square brackets.

In [20]:
subset = df[['country', 'continent', 'year']]
print(subset.head())

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972


In [21]:
print(subset.tail())

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007


**deprecated**
Subsetting Columns by Index Position Break in Pandas i.e df[[1]], df[[0, -1]]

## Subsetting Rows

### loc - Subset based on index label (row name)
#### in this example the index label's type is integer so for loc[0] works, if the index label was strings then we can use only strings for loc[string]
### iloc - Subset based on row index (row number)

In [22]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [23]:
df.loc[0] # index starts from 0

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [24]:
df.loc[99]

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object

In [26]:
df.loc[-1] # ERROR (ValueError: -1 is not in range) 
# BUT -1 WORKS IN iloc

KeyError: -1

**ALTERNATIVERLY**

In [39]:
df.loc[df.shape[0] - 1] # ALTERNATIVE FOR df.loc[-1]

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [43]:
df.tail(1) # BUT THIS results in a DATAFRAME and not SERIES

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


#### iloc

In [30]:
print(df.iloc[0], df.iloc[99], df.iloc[1703], sep='\n\n')

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object


In [32]:
df.iloc[-1] # -1 WORKS IN iloc # or can do df.iloc[df.shape[0] - 1]

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [103]:
df.iloc[df.shape[0] - 1] # THIS WILL WORK ONLY HERE COZ the index label
# starts from 0... if the index label starts from 101... then this
# method won't work

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [37]:
# SERIES
s = df.loc[0]
print(type(s), s, sep='\n\n')

<class 'pandas.core.series.Series'>

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object


In [38]:
# DATAFRAME
s1 = df.head(1)
print(type(s1))
s1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314


### Subsetting Multiple Rows, works on both loc and iloc

In [44]:
df.loc[[0, 99, 999]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086
999,Mongolia,Asia,1967,51.253,1149500,1226.04113


In [45]:
df.iloc[[0, 99, 999]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086
999,Mongolia,Asia,1967,51.253,1149500,1226.04113


#### for one row it is Series and for more than one row it is DataFrame

In [47]:
print(df.loc[0], type(df.loc[0]), sep='\n\n', end='\n\n')
print(df.loc[[0, 99]], type(df.loc[[0, 99]]), sep='\n\n')

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

<class 'pandas.core.series.Series'>

        country continent  year  lifeExp       pop   gdpPercap
0   Afghanistan      Asia  1952   28.801   8425333  779.445314
99   Bangladesh      Asia  1967   43.453  62821884  721.186086

<class 'pandas.core.frame.DataFrame'>


In [49]:
# Series
# 1d
# only 1 data type

In [50]:
# DataFrame
# 2d
# combination of many series
# many data types

In [2]:
# data.ix[0] # used to be supported by pandas, but not anymore

# MAIN DIFFERENCE BETWEEN loc AND iloc

In [6]:
df.loc[1:5]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336


In [8]:
df.iloc[1:5]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


## loc range includes 5 and iloc range excludes 5

### Combining column and row subsetting

#### df.loc/iloc[rows, columns]

for **loc** we can use only column names and not column indices

for **iloc** we can use only column indices and not column names

In [60]:
subset = df.loc[:, ['year', 'pop']]
subset.head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


In [59]:
s = df.iloc[:, [2, 4, -1]]
s.head()

Unnamed: 0,year,pop,gdpPercap
0,1952,8425333,779.445314
1,1957,9240934,820.85303
2,1962,10267083,853.10071
3,1967,11537966,836.197138
4,1972,13079460,739.981106


In [61]:
# we cannot use "column indices" with "loc"
subset = df.loc[:, [2, 4]]
subset

KeyError: "None of [Int64Index([2, 4], dtype='int64')] are in the [columns]"

In [63]:
# we cannot use "column names" with "iloc"
subset = df.iloc[:, ['year', 'pop']]
subset

IndexError: .iloc requires numeric indexers, got ['year' 'pop']

### Subsetting Columns by Range

In [64]:
r = list(range(5))
print(r)

[0, 1, 2, 3, 4]


In [65]:
s = df.iloc[:, r]
s

Unnamed: 0,country,continent,year,lifeExp,pop
0,Afghanistan,Asia,1952,28.801,8425333
1,Afghanistan,Asia,1957,30.332,9240934
2,Afghanistan,Asia,1962,31.997,10267083
3,Afghanistan,Asia,1967,34.020,11537966
4,Afghanistan,Asia,1972,36.088,13079460
...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418
1700,Zimbabwe,Africa,1992,60.377,10704340
1701,Zimbabwe,Africa,1997,46.809,11404948
1702,Zimbabwe,Africa,2002,39.989,11926563


In [66]:
r = list(range(3, 6))
r

[3, 4, 5]

In [73]:
s = df.iloc[:, r]
s.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [69]:
r = list(range(0, 6, 2))
r

[0, 2, 4]

In [72]:
s = df.iloc[:, r]
s.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [74]:
r = list(range(0, 10))
r

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [77]:
s = df.iloc[:, r] # r out of range will not work
s

IndexError: positional indexers are out-of-bounds

### Slicing Columns

In [79]:
subset = df.iloc[:, :3]
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [81]:
subset = df.iloc[:, 3:6]
subset.head()

Unnamed: 0,lifeExp,pop,gdpPercap
0,28.801,8425333,779.445314
1,30.332,9240934,820.85303
2,31.997,10267083,853.10071
3,34.02,11537966,836.197138
4,36.088,13079460,739.981106


In [83]:
subset = df.iloc[:, 0:6:2]
subset.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [85]:
# df.iloc[:, 0:6:] # 6 columns
df.iloc[:, 0::2] # 3 columns
# df.iloc[:, :6:2] # 3 columns
# df.iloc[:, ::2] # 3 columns
# df.iloc[:, ::] # 6 columns

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460
...,...,...,...
1699,Zimbabwe,1987,9216418
1700,Zimbabwe,1992,10704340
1701,Zimbabwe,1997,11404948
1702,Zimbabwe,2002,11926563


### Subsetting Rows and Columns

In [86]:
df.loc[42, 'country']

'Angola'

In [87]:
df.iloc[42, 0]

'Angola'

In [90]:
# df.loc[42, 0] # error, even, df.iloc[42, 'country']

### Subsetting Multiple Rows and Columns

In [91]:
df.iloc[[0, 99, 999], [0, 3, 5]]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


In [92]:
df.loc[[0, 99, 999], ['country', 'lifeExp', 'gdpPercap']]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


In [104]:
df.loc[10:13, ['country', 'lifeExp', 'gdpPercap']]
# SEE HERE, VVVVV IIIMMMMPPPP THING
# IT IS INCLUSIVE OF BOTH 10 and 13
# 13 is also included here
# ******************************************
# VVVVVV IIIMMMPPPP

Unnamed: 0,country,lifeExp,gdpPercap
10,Afghanistan,42.129,726.734055
11,Afghanistan,43.828,974.580338
12,Albania,55.23,1601.056136
13,Albania,59.28,1942.284244


## Series

In [95]:
s = pd.Series(['banana', 42])
print(s)
# key value pair just like dictionary

0    banana
1        42
dtype: object


In [105]:
s = pd.Series(['Mckinney', 'Creator of Pandas'], index=['Person', 'Who'])
s

Person             Mckinney
Who       Creator of Pandas
dtype: object

In [107]:
s.index

Index(['Person', 'Who'], dtype='object')

In [108]:
s.values

array(['Mckinney', 'Creator of Pandas'], dtype=object)

## DataFrame

In [109]:
df = pd.DataFrame({
    'Name': ['R Franklin', 'W Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-7-25', '1876-6-13'],
    'Died': ['1958-4-16', '1937-10-16'],
    'Age': ['37', '61']
})
# print(df)
df

Unnamed: 0,Name,Occupation,Born,Died,Age
0,R Franklin,Chemist,1920-7-25,1958-4-16,37
1,W Gosset,Statistician,1876-6-13,1937-10-16,61


In [115]:
# df.set_index(pd.Index([...]))  # new indices
# df.set_index(["", "", ..]) df.set_index("") # from column/s
# inplace=True for inplace

In [116]:
# df.drop([columns], axis=1)

### Group by split apply and combine

In [126]:
df = pd.read_csv('./unit 5/weather_by_cities.csv')
df

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,2/1/2017,new york,36,7,Sunny
2,3/1/2017,new york,28,12,Snow
3,4/1/2017,new york,33,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,2/1/2017,mumbai,85,12,Fog
6,3/1/2017,mumbai,87,15,Fog
7,4/1/2017,mumbai,92,5,Rain
8,1/1/2017,paris,45,20,Sunny
9,2/1/2017,paris,50,13,Cloudy


In [127]:
g = df.groupby("city")
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000204FEB2F700>

In [129]:
for city, data in g:
    print("City: ", city, sep='\n', end='\n\n')
    print("Data: ", data, sep='\n', end='\n\n')

City: 
mumbai

Data: 
        day    city  temperature  windspeed  event
4  1/1/2017  mumbai           90          5  Sunny
5  2/1/2017  mumbai           85         12    Fog
6  3/1/2017  mumbai           87         15    Fog
7  4/1/2017  mumbai           92          5   Rain

City: 
new york

Data: 
        day      city  temperature  windspeed  event
0  1/1/2017  new york           32          6   Rain
1  2/1/2017  new york           36          7  Sunny
2  3/1/2017  new york           28         12   Snow
3  4/1/2017  new york           33          7  Sunny

City: 
paris

Data: 
         day   city  temperature  windspeed   event
8   1/1/2017  paris           45         20   Sunny
9   2/1/2017  paris           50         13  Cloudy
10  3/1/2017  paris           54          8  Cloudy
11  4/1/2017  paris           42         10  Cloudy



**This is similar to SQL,
SELECT * from weather_data GROUP BY city**

In [130]:
g.get_group('mumbai')

Unnamed: 0,day,city,temperature,windspeed,event
4,1/1/2017,mumbai,90,5,Sunny
5,2/1/2017,mumbai,85,12,Fog
6,3/1/2017,mumbai,87,15,Fog
7,4/1/2017,mumbai,92,5,Rain


In [131]:
g.max()

Unnamed: 0_level_0,day,temperature,windspeed,event
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mumbai,4/1/2017,92,15,Sunny
new york,4/1/2017,36,12,Sunny
paris,4/1/2017,54,20,Sunny


In [132]:
g.min()

Unnamed: 0_level_0,day,temperature,windspeed,event
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mumbai,1/1/2017,85,5,Fog
new york,1/1/2017,28,6,Rain
paris,1/1/2017,42,8,Cloudy


In [133]:
g.mean()

Unnamed: 0_level_0,temperature,windspeed
city,Unnamed: 1_level_1,Unnamed: 2_level_1
mumbai,88.5,9.25
new york,32.25,8.0
paris,47.75,12.75


This method of splitting your dataset in smaller groups and then applying an operation
(such as min or max) to get aggregate result is called Split-Apply-Combine. It is illustrated
in a diagram below

In [134]:
g.describe()

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,temperature,temperature,temperature,temperature,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
mumbai,4.0,88.5,3.109126,85.0,86.5,88.5,90.5,92.0,4.0,9.25,5.057997,5.0,5.0,8.5,12.75,15.0
new york,4.0,32.25,3.304038,28.0,31.0,32.5,33.75,36.0,4.0,8.0,2.708013,6.0,6.75,7.0,8.25,12.0
paris,4.0,47.75,5.315073,42.0,44.25,47.5,51.0,54.0,4.0,12.75,5.251984,8.0,9.5,11.5,14.75,20.0


In [135]:
g.size()

city
mumbai      4
new york    4
paris       4
dtype: int64

In [136]:
g.count()

Unnamed: 0_level_0,day,temperature,windspeed,event
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mumbai,4,4,4,4
new york,4,4,4,4
paris,4,4,4,4


In [137]:
# handling missing data - diff notebook

### Merge and Join

The pd.merge() function implements a number of types of joins: the one-to-one,many-to-
one, and many-to-many joins.

All three types of joins are accessed via an identical call to the pd.merge()

### One-to-one joins

simplest type of merge expression is the one-to-one join, which is very similar to the
column-wise concatenation

In [138]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})

In [139]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


### Many-to-one joins

Many-to-one joins are joins in which one of the two key columns contains duplicate entries.
For the many-to-one case, the resulting DataFrame will preserve those duplicate entries as
appropriate

In [142]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


**here df4 group col has all single values (so one) and df3 group col has repeated values in group col (so many) so many to one**

In [141]:
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [143]:
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


### Many-to-many joins

If the key column in both the left and right array contains duplicates, then the result is a
many-to-many merge.
Consider the following, where we have a DataFrame showing one or more skills associated
with a particular group.

In [144]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],'skills': ['math',
'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


**here df5 group col has all repeated values (so many) and df1 group col has repeated values in group col (so many) so many to many**

In [145]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [146]:
pd.merge(df1, df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


### Basic Merge Using a Dataframe Column

In [147]:
df1 = pd.DataFrame({
"city": ["new york","chicago","orlando"],
"temperature": [21,14,35],
})
df1

Unnamed: 0,city,temperature
0,new york,21
1,chicago,14
2,orlando,35


In [148]:
df2 = pd.DataFrame({
"city": ["chicago","new york","orlando"],
"humidity": [65,68,75],
})
df2

Unnamed: 0,city,humidity
0,chicago,65
1,new york,68
2,orlando,75


In [149]:
df3 = pd.merge(df1, df2, on="city")
df3

Unnamed: 0,city,temperature,humidity
0,new york,21,68
1,chicago,14,65
2,orlando,35,75


### Type Of DataBase Joins

left, right, inner, outer(full join)

In [151]:
df1 = pd.DataFrame({
"city": ["new york","chicago","orlando", "baltimore"],
"temperature": [21,14,35, 38],
})
df1

Unnamed: 0,city,temperature
0,new york,21
1,chicago,14
2,orlando,35
3,baltimore,38


In [152]:
df2 = pd.DataFrame({
"city": ["chicago","new york","san diego"],
"humidity": [65,68,71],
})
df2

Unnamed: 0,city,humidity
0,chicago,65
1,new york,68
2,san diego,71


In [153]:
df3 = pd.merge(df1, df2, on="city", how="inner")
df3

Unnamed: 0,city,temperature,humidity
0,new york,21,68
1,chicago,14,65


In [154]:
df3 = pd.merge(df1, df2, on="city", how="outer")
df3

Unnamed: 0,city,temperature,humidity
0,new york,21.0,68.0
1,chicago,14.0,65.0
2,orlando,35.0,
3,baltimore,38.0,
4,san diego,,71.0


In [155]:
df3 = pd.merge(df1, df2, on="city", how="left")
df3

Unnamed: 0,city,temperature,humidity
0,new york,21,68.0
1,chicago,14,65.0
2,orlando,35,
3,baltimore,38,


In [156]:
df3 = pd.merge(df1, df2, on="city", how="right")
df3

Unnamed: 0,city,temperature,humidity
0,chicago,14.0,65
1,new york,21.0,68
2,san diego,,71


### indicator flag

In [157]:
df3 = pd.merge(df1, df2, on="city", how="outer", indicator=True)
df3

Unnamed: 0,city,temperature,humidity,_merge
0,new york,21.0,68.0,both
1,chicago,14.0,65.0,both
2,orlando,35.0,,left_only
3,baltimore,38.0,,left_only
4,san diego,,71.0,right_only


### suffixes

In [158]:
df1 = pd.DataFrame({
"city": ["new york","chicago","orlando", "baltimore"],
"temperature": [21,14,35, 38],
"humidity": [65,68,71, 75]
})
df1

Unnamed: 0,city,temperature,humidity
0,new york,21,65
1,chicago,14,68
2,orlando,35,71
3,baltimore,38,75


In [159]:
df2 = pd.DataFrame({
"city": ["chicago","new york","san diego"],
"temperature": [21,14,35],
"humidity": [65,68,71]
})
df2

Unnamed: 0,city,temperature,humidity
0,chicago,21,65
1,new york,14,68
2,san diego,35,71


In [161]:
df3 = pd.merge(df1, df2, on="city", how="outer", 
               suffixes=("_first", "_second"))
df3

Unnamed: 0,city,temperature_first,humidity_first,temperature_second,humidity_second
0,new york,21.0,65.0,14.0,68.0
1,chicago,14.0,68.0,21.0,65.0
2,orlando,35.0,71.0,,
3,baltimore,38.0,75.0,,
4,san diego,,,35.0,71.0


## join

In [162]:
df1 = pd.DataFrame({
    "city": ["new york", "chicago", "orlando"],
    "temperature": [21, 14, 35]
})
df1.set_index("city", inplace=True)
df1

Unnamed: 0_level_0,temperature
city,Unnamed: 1_level_1
new york,21
chicago,14
orlando,35


In [163]:
df2 = pd.DataFrame({
    "city": ["chicago", "new york", "orlando"],
    "temperature": [65, 68, 75]
})
df2.set_index("city", inplace=True)
df2

Unnamed: 0_level_0,temperature
city,Unnamed: 1_level_1
chicago,65
new york,68
orlando,75


In [164]:
df1.join(df2, lsuffix="_l", rsuffix="_r")

Unnamed: 0_level_0,temperature_l,temperature_r
city,Unnamed: 1_level_1,Unnamed: 2_level_1
new york,21,68
chicago,14,65
orlando,35,75


In [165]:
df2 = pd.DataFrame({
    "city": ["chicago", "new york", "orlando"],
    "humidity": [65, 68, 75]
})
df2.set_index("city", inplace=True)
df2

Unnamed: 0_level_0,humidity
city,Unnamed: 1_level_1
chicago,65
new york,68
orlando,75


In [166]:
df1.join(df2, lsuffix="_l", rsuffix="_r")

Unnamed: 0_level_0,temperature,humidity
city,Unnamed: 1_level_1,Unnamed: 2_level_1
new york,21,68
chicago,14,65
orlando,35,75


The related DataFrame.join method, uses merge internally for the index-on-index and index-on-column(s) joins, but joins on indexes by default rather than trying to join on common columns (the default behavior for merge). If you are joining on index, you may wish to use DataFrame.join to save yourself some typing.

merge is a function in the pandas namespace, and it is also available as a DataFrame instance method, with the calling DataFrame being implicitly considered the left object in the join.

These two function calls are completely equivalent:

left.join(right, on=key_or_keys)

pd.merge(left, right, left_on=key_or_keys, right_index=True, how='left', sort=False)