<a href="https://colab.research.google.com/github/Anniikett/ANN-for-Regression_Sklearn/blob/main/3_2_Concatenating_and_Merging_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Concatenating Datasets

Pandas is rich in its support for combining datasets and data with different datatypes. It also provides functionality to support set logic for indexes and relational algebra.

### `concat()` Function

The `concat()` function does the heavy lifting of concatenating two or more Series or DataFrames along an axis. It can also apply optional set logic (union or intersection) to the indexes, if any, on the other axes.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# US state abbreviations grouped into four regions, one Series per region.
# Each Series gets the default 0..n-1 integer index.
northeast = pd.Series("CT ME MA NH RI VT NJ NY PA".split())
south = pd.Series(
    "DE FL GA MD NC SC VA DC WV AL KY MS TN AR LA OK TX".split()
)
midwest = pd.Series("IL IN MN MO NE ND SD IA KS MI OH WI".split())
west = pd.Series("AZ CO ID MT NV NM UT WY AK CA HI OR WA".split())

In [None]:
# Stack the two regional Series end to end. Series.append() was deprecated in
# pandas 1.4 and removed in 2.0, so pd.concat() is used; like append(), it
# keeps each input's original labels (0-8 and 0-16), so labels repeat.
series1 = pd.concat([northeast, south])
series1

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object

In [None]:
# Label-based lookup: the combined Series kept each input's original labels,
# so label 3 exists twice and .loc returns both matching rows.
series1.loc[3]

3    NH
3    MD
dtype: object

In [None]:
# reset_index() returns a DataFrame with a fresh 0..n-1 index and the old
# labels kept in an "index" column; the result is only displayed, not assigned.
series1.reset_index()

Unnamed: 0,index,0
0,0,CT
1,1,ME
2,2,MA
3,3,NH
4,4,RI
5,5,VT
6,6,NJ
7,7,NY
8,8,PA
9,0,DE


In [None]:
# drop=True discards the old (repeating) labels instead of keeping them as a
# column, so the result stays a Series with a fresh 0..n-1 index.
series1 = series1.reset_index(drop=True)
series1

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
11    GA
12    MD
13    NC
14    SC
15    VA
16    DC
17    WV
18    AL
19    KY
20    MS
21    TN
22    AR
23    LA
24    OK
25    TX
dtype: object

In [None]:
# pd.concat() on a list of Series stacks them end to end and, by default,
# keeps each input's own labels, so labels 0-8 repeat in the result.
series2 = pd.concat([northeast, south])
series2

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object

In [None]:
# ignore_index=True discards the input labels and numbers the result 0..n-1,
# so no separate reset_index(drop=True) step is needed.
series2 = pd.concat([northeast, south], ignore_index=True)
series2

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
11    GA
12    MD
13    NC
14    SC
15    VA
16    DC
17    WV
18    AL
19    KY
20    MS
21    TN
22    AR
23    LA
24    OK
25    TX
dtype: object

In [None]:
## Using DataFrames:

In [None]:
# Load the first population sample; index_col=0 makes the first CSV column
# the row index.
pop1 = pd.read_csv("population_01.csv", index_col=0)
pop1

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670


In [None]:
# Load the second population sample, again using the first CSV column as the
# row index.
pop2 = pd.read_csv("population_02.csv", index_col=0)
pop2

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
12776,2180
76092,26669
98360,12221
49464,27481


In [None]:
# Stack pop2's rows under pop1's. DataFrame.append() was removed in
# pandas 2.0; pd.concat() is the supported equivalent and returns a new frame.
pd.concat([pop1, pop2])

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


In [None]:
# Load the population table used in the join examples; the first CSV column
# becomes the row index.
population = pd.read_csv("population_00.csv", index_col=0)
population

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
57538,322
59916,130
37660,40038
2860,45199


In [None]:
# Load the unemployment table; the first CSV column becomes the row index.
unemployment = pd.read_csv("unemployment.csv", index_col=0)
unemployment

Unnamed: 0_level_0,Unemployment,Participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1
2860,0.11,34447
46167,0.02,4800
1097,0.33,42
80808,0.07,4310


In [None]:
# Stack two frames with different columns row-wise: the result has the union
# of the columns, with NaN where a row's source frame lacked that column.
# (DataFrame.append() was removed in pandas 2.0, so pd.concat() is used.)
pd.concat([population, unemployment])

Unnamed: 0,2010 Census Population,Unemployment,Participants
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,0.11,34447.0
46167,,0.02,4800.0
1097,,0.33,42.0
80808,,0.07,4310.0


In [None]:
# Concat at row level: rows are stacked and columns are aligned by name.
pd.concat([population, unemployment])

Unnamed: 0,2010 Census Population,Unemployment,Participants
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,0.11,34447.0
46167,,0.02,4800.0
1097,,0.33,42.0
80808,,0.07,4310.0


In [None]:
# Concat at row level, with the axis spelled out (axis=0 stacks rows).
pd.concat([population, unemployment], axis=0)

Unnamed: 0,2010 Census Population,Unemployment,Participants
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,0.11,34447.0
46167,,0.02,4800.0
1097,,0.33,42.0
80808,,0.07,4310.0


In [None]:
# Concat at column level: axis=1 places the frames side by side and aligns
# rows on the index labels, filling NaN where a label is missing from a frame.
pd.concat([population, unemployment], axis=1)

Unnamed: 0,2010 Census Population,Unemployment,Participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


In [None]:
#Multi-Indexes

In [None]:
# Load the 2013 first-quarter rainfall table; the first CSV column becomes
# the row index.
q1_2013 = pd.read_csv("q1_rainfall_2013.csv", index_col=0)
q1_2013

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613


In [None]:
# Load the 2014 first-quarter rainfall table; the first CSV column becomes
# the row index.
q1_2014 = pd.read_csv("q1_rainfall_2014.csv", index_col=0)
q1_2014

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.050323
Feb,0.082143
Mar,0.70968


In [None]:
# Stack both quarters row-wise. Both frames use the same month labels, so the
# index repeats and a row's year cannot be recovered from the index alone.
q1_1314 = pd.concat([q1_2013, q1_2014])
q1_1314

In [None]:
# keys=[...] labels each input frame, adding an outer index level (year) above
# the month labels so the result has a two-level row MultiIndex.
q1_1314 = pd.concat([q1_2013, q1_2014], keys=[2013,2014])
q1_1314

In [None]:
# Select all rows under the outer key 2013 of the row MultiIndex.
q1_1314.loc[2013]

In [None]:
# axis="columns" is the named form of axis=1: the frames are placed side by
# side. Both carry a "Precipitation" column, so the column name is duplicated.
q1_1314 = pd.concat([q1_2013, q1_2014], axis="columns")
q1_1314

In [None]:
# With axis=1, keys=[...] become an outer column level, which disambiguates
# the two identically named "Precipitation" columns by year.
q1_1314 = pd.concat([q1_2013, q1_2014], keys=[2013,2014], axis=1)
q1_1314

In [None]:
# Select the columns under the outer column key 2013.
q1_1314[2013]

In [None]:
# Map each year to its rainfall frame so the dict can be handed to pd.concat.
dict_q1 = {2013:q1_2013, 2014:q1_2014}
dict_q1

In [None]:
# Passing a dict to pd.concat uses the dict keys as the `keys` argument, so
# this builds year-keyed columns without an explicit keys=[...].
rain_1314 = pd.concat(dict_q1, axis=1)
rain_1314

In [None]:
### Inner and Outer Joins

In [None]:
# np.arange

In [None]:
# arange(start, stop, step): values 4, 6, 8, 10 (the stop value 12 is excluded).
np.arange(4,12,2)

In [None]:
# With a single argument, arange counts from 0 up to (not including) 8.
np.arange(8)

In [None]:
# Reshape the 8 values into a 2-row by 4-column array.
np.arange(8).reshape(2,4)

In [None]:
# Adding a scalar broadcasts it to every element, giving a float array.
np.arange(8).reshape(2,4) + 0.1

In [None]:
# A: 2x4 float array holding 0..7, each offset by 0.1 so its values are
# easy to recognise after stacking.
_a_grid = np.arange(8).reshape(2, 4)
A = np.add(_a_grid, 0.1)
A

In [None]:
# B: 2x3 float array holding 0..5, each offset by 0.2 (same row count as A).
_b_grid = np.arange(6).reshape(2, 3)
B = np.add(_b_grid, 0.2)
B

In [None]:
# C: 3x4 float array holding 0..11, each offset by 0.3 (same column count as A).
_c_grid = np.arange(12).reshape(3, 4)
C = np.add(_c_grid, 0.3)
C

In [None]:
# Stack arrays horizontally (side by side); the row counts must match.
# A is 2x4 and B is 2x3, so the result is 2x7.

np.hstack([A,B])

In [None]:
# np.concatenate along axis=1 joins the columns side by side, giving the same
# 2x7 result as np.hstack for these 2-D arrays.
np.concatenate([A,B], axis=1)

In [None]:
# Joining along axis=1 needs equal row counts, but A has 2 rows and C has 3,
# so NumPy raises ValueError. The error is caught and printed so the failure
# is still shown without stopping "Restart & Run All".
try:
    np.concatenate([A, C], axis=1)
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Stack arrays vertically (one on top of the other); the column counts must match.
# A is 2x4 and C is 3x4, so the result is 5x4.

np.vstack([A,C])

In [None]:
# np.concatenate along axis=0 stacks the rows, giving the same 5x4 result as
# np.vstack for these 2-D arrays.
np.concatenate([A,C], axis=0)

In [None]:
# Joining along axis=0 needs equal column counts, but B has 3 columns and C
# has 4, so NumPy raises ValueError. The error is caught and printed so the
# failure is still shown without stopping "Restart & Run All".
try:
    np.concatenate([B, C], axis=0)
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Re-display the population frame for reference before the join examples.
population

In [None]:
# Re-display the unemployment frame for reference before the join examples.
unemployment

In [None]:
# Column-wise concat with the default join ("outer"): the result keeps the
# union of both indexes and fills NaN where a label is missing from a frame.
pd.concat([population, unemployment], axis=1)

In [None]:
# join="outer" spelled out explicitly; it is the default, so this matches the
# plain axis=1 concat.
pd.concat([population, unemployment], axis=1, join="outer")

In [None]:
# join="inner" keeps only the index labels present in both frames (the
# intersection), so no NaN padding is introduced.
pd.concat([population, unemployment], axis=1, join="inner")

In [None]:
# Row-wise concat with join="inner" keeps only the columns common to both
# frames. Going by the displayed column headers the two frames share no
# column names, so expect all rows but no columns -- confirm on run.
pd.concat([population, unemployment], axis=0, join="inner")

In [None]:
# Row-wise concat with join="outer" (the default) keeps the union of the
# columns and fills NaN where a frame lacks a column.
pd.concat([population, unemployment], axis=0, join="outer")