# Connecting multiple data frames 

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Two small tables that share an 'id' column; the merge examples below join on it.
frame1 = pd.DataFrame(
    {
        'id': ['ball', 'pencil', 'pen', 'mug', 'ashtray'],
        'price': [12.33, 11.44, 33.21, 13.23, 33.62],
    }
)
# 'pencil' appears twice here, while 'mug' and 'ashtray' from frame1 do not appear at all.
frame2 = pd.DataFrame(
    {
        'id': ['pencil', 'pencil', 'ball', 'pen'],
        'color': ['white', 'red', 'red', 'black'],
    }
)

In [4]:
# Show both frames at once: a tuple is rendered with each frame's plain-text repr
frame1 , frame2

(        id  price
 0     ball  12.33
 1   pencil  11.44
 2      pen  33.21
 3      mug  13.23
 4  ashtray  33.62,
        id  color
 0  pencil  white
 1  pencil    red
 2    ball    red
 3     pen  black)

## Merging 2 DataFrames

In [3]:
# No `on` given: merge joins on the column the two frames share ('id').
# how='inner' is merge's default, written out here to make the join type visible.
pd.merge(frame1, frame2, how='inner')

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


* With no `on` argument, `merge` picked the column common to both DataFrames (`'id'`) and merged on it.
* The output contains only the rows whose `id` appears in both DataFrames (an inner join).

In [5]:
# Same inner join, this time through the DataFrame method and with the key column named explicitly via `on`
frame1.merge(frame2 , on = "id" )

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [6]:
# Add another column, 'brand', to the dataframe "frame1" (this modifies frame1 in place)
# One brand per row of frame1, in row order
series1 = pd.Series(['OMG','ABC','ABC','POD','POD'])
frame1['brand'] = series1 
frame1

Unnamed: 0,id,price,brand
0,ball,12.33,OMG
1,pencil,11.44,ABC
2,pen,33.21,ABC
3,mug,13.23,POD
4,ashtray,33.62,POD


In [7]:
# Give frame2 a 'brand' column as well (this modifies frame2 in place)
# Note: no (id, brand) pair in frame2 matches a pair in frame1
series2 = pd.Series(['OMG','POD','ABC','POD'])
frame2['brand'] = series2
frame2

Unnamed: 0,id,color,brand
0,pencil,white,OMG
1,pencil,red,POD
2,ball,red,ABC
3,pen,black,POD


In [9]:
# Show both frames again: they now share two column names, 'id' and 'brand'
frame1 , frame2

(        id  price brand
 0     ball  12.33   OMG
 1   pencil  11.44   ABC
 2      pen  33.21   ABC
 3      mug  13.23   POD
 4  ashtray  33.62   POD,
        id  color brand
 0  pencil  white   OMG
 1  pencil    red   POD
 2    ball    red   ABC
 3     pen  black   POD)

In [8]:
# With no `on`, merge joins on every shared column - now both 'id' and 'brand' -
# and no (id, brand) pair occurs in both frames, so the result has no rows
pd.merge(frame1,frame2)

Unnamed: 0,id,price,brand,color


* It returned no rows because the common columns of `frame1` and `frame2` have no common values.
* Here the common columns are `'id'` and `'brand'`, and `merge` uses the combination of the two as the key. No (`id`, `brand`) combination occurs in both DataFrames, so nothing matches.

In [10]:
# Choose the join key ourselves: merge on 'id' only.
# 'brand' exists in both frames but is not a key, so it comes back twice; the default
# suffixes ('_x' for the left frame, '_y' for the right) are written out explicitly.
pd.merge(frame1, frame2, on='id', suffixes=('_x', '_y'))

Unnamed: 0,id,price,brand_x,color,brand_y
0,ball,12.33,OMG,red,ABC
1,pencil,11.44,ABC,white,OMG
2,pencil,11.44,ABC,red,POD
3,pen,33.21,ABC,black,POD


#### What if we want to merge based on different columns in the 2 dataframes 

In [11]:
# frame2 as it stands before its key column is renamed
frame2

Unnamed: 0,id,color,brand
0,pencil,white,OMG
1,pencil,red,POD
2,ball,red,ABC
3,pen,black,POD


In [12]:
# Rename frame2's key column from 'id' to 'sid' by assigning the full list of column labels
# (this changes frame2 in place; the other two labels are unchanged)
frame2.columns = ['sid' ,'color' ,'brand'] 

In [13]:
# frame2 after the rename: the first column is now labelled 'sid'
frame2

Unnamed: 0,sid,color,brand
0,pencil,white,OMG
1,pencil,red,POD
2,ball,red,ABC
3,pen,black,POD


In [14]:
# The key is labelled 'id' in frame1 but 'sid' in frame2, so each side's key column
# is named separately. Both key columns are kept in the result.
frame1.merge(frame2, left_on='id', right_on='sid')

Unnamed: 0,id,price,brand_x,sid,color,brand_y
0,ball,12.33,OMG,ball,red,ABC
1,pencil,11.44,ABC,pencil,white,OMG
2,pencil,11.44,ABC,pencil,red,POD
3,pen,33.21,ABC,pen,black,POD


#### Left Joins 

In [15]:
# Restore the original column labels so that both frames share the 'id' key again
frame2.columns = ['id' ,'color' ,'brand'] 

In [16]:
# Left join: every row of frame1 is kept. 'mug' and 'ashtray' have no match in frame2,
# so their color / brand_y cells are missing (NaN).
pd.merge(left=frame1, right=frame2, how='left', on='id')

Unnamed: 0,id,price,brand_x,color,brand_y
0,ball,12.33,OMG,red,ABC
1,pencil,11.44,ABC,white,OMG
2,pencil,11.44,ABC,red,POD
3,pen,33.21,ABC,black,POD
4,mug,13.23,POD,,
5,ashtray,33.62,POD,,


#### Right Join 

In [17]:
# Right join: every row of frame2 is kept, in frame2's row order.
# Ids that exist only in frame1 ('mug', 'ashtray') are dropped.
pd.merge(left=frame1, right=frame2, how='right', on='id')

Unnamed: 0,id,price,brand_x,color,brand_y
0,pencil,11.44,ABC,white,OMG
1,pencil,11.44,ABC,red,POD
2,ball,12.33,OMG,red,ABC
3,pen,33.21,ABC,black,POD


#### Outer Join

In [18]:
# An outer join is like the union of both dataframes: rows from either side are kept.
# The key here is the pair ['id', 'brand']; no pair occurs in both frames, so all
# 5 + 4 rows appear, with NaN in the columns that come from the other frame.
pd.merge(frame1, frame2 , on = ['id', 'brand'] , how = 'outer')

Unnamed: 0,id,price,brand,color
0,ball,12.33,OMG,
1,pencil,11.44,ABC,
2,pen,33.21,ABC,
3,mug,13.23,POD,
4,ashtray,33.62,POD,
5,pencil,,OMG,white
6,pencil,,POD,red
7,ball,,ABC,red
8,pen,,POD,black


#### Concatenation of 2 dataframes 

In [19]:
import pandas as pd
import numpy as np

In [20]:
# Two 3x3 frames of uniform random numbers with the same columns but disjoint row labels.
# (No seed is set, so the values differ on every run.)
frame1 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[1, 2, 3],
    columns=['A', 'B', 'C'],
)
frame2 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[4, 5, 6],
    columns=['A', 'B', 'C'],
)

In [21]:
# First input to the concat examples: rows labelled 1-3
frame1

Unnamed: 0,A,B,C
1,0.869027,0.946453,0.596921
2,0.337669,0.623442,0.813436
3,0.799269,0.484601,0.520155


In [22]:
# Second input to the concat examples: rows labelled 4-6
frame2

Unnamed: 0,A,B,C
4,0.530807,0.407294,0.646242
5,0.360031,0.742138,0.398178
6,0.981577,0.93507,0.359843


In [23]:
# By default (axis=0) concat stacks the frames vertically, one below the other
# Concatenation along the rows : the number of rows has increased
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.869027,0.946453,0.596921
2,0.337669,0.623442,0.813436
3,0.799269,0.484601,0.520155
4,0.530807,0.407294,0.646242
5,0.360031,0.742138,0.398178
6,0.981577,0.93507,0.359843


In [24]:
# The dataframes to combine are passed together as a list (inside square brackets).
# axis=0 is the default, so this gives the same result as the previous cell.
pd.concat([frame1, frame2] , axis =0)

Unnamed: 0,A,B,C
1,0.869027,0.946453,0.596921
2,0.337669,0.623442,0.813436
3,0.799269,0.484601,0.520155
4,0.530807,0.407294,0.646242
5,0.360031,0.742138,0.398178
6,0.981577,0.93507,0.359843


In [25]:
# Horizontal concatenation (axis=1): the frames are placed side by side
# Concatenation along the columns : the number of columns has increased
# Rows are matched by index label; these frames share none (1-3 vs 4-6), so the gaps are NaN
pd.concat([frame1, frame2] ,axis=1 )

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.869027,0.946453,0.596921,,,
2,0.337669,0.623442,0.813436,,,
3,0.799269,0.484601,0.520155,,,
4,,,,0.530807,0.407294,0.646242
5,,,,0.360031,0.742138,0.398178
6,,,,0.981577,0.93507,0.359843


In [26]:
# With axis=1, ignore_index=True replaces the column labels with 0..5;
# the row index (1-6) is left as it is
pd.concat([frame1, frame2] ,axis=1 , ignore_index = True)

Unnamed: 0,0,1,2,3,4,5
1,0.869027,0.946453,0.596921,,,
2,0.337669,0.623442,0.813436,,,
3,0.799269,0.484601,0.520155,,,
4,,,,0.530807,0.407294,0.646242
5,,,,0.360031,0.742138,0.398178
6,,,,0.981577,0.93507,0.359843


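* `ignore_index=True` discards the labels along the concatenation axis and replaces them with default integers. With `axis=1` that axis is the columns, so the column labels `A, B, C, A, B, C` become `0` to `5`.
* The row index is not reset: the rows keep their original labels `1` to `6`. (With the default `axis=0`, it is the row index that would be renumbered instead, and the column labels would be kept.)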

In [27]:
# Two more 3x3 random frames, this time sharing the same row labels (1-3).
# (No seed is set, so the values differ on every run.)
frame3 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[1, 2, 3],
    columns=['A', 'B', 'C'],
)
frame4 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[1, 2, 3],
    columns=['A', 'B', 'C'],
)

In [28]:
# Both frames have the same row labels (1-3), so axis=1 lines them up side by side with no NaN
pd.concat([frame3,frame4] , axis = 1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.842862,0.259214,0.031051,0.434985,0.782834,0.046169
2,0.571986,0.975803,0.290094,0.614254,0.708972,0.195129
3,0.081118,0.462781,0.029655,0.972961,0.357888,0.899907


# DataFrame Display settings

In [29]:
# Widen pandas' display limits for the rest of the session.
pd.set_option("display.max_columns", 200)    # show up to 200 columns before truncating
pd.set_option("display.max_rows", 1000)      # show up to 1000 rows before truncating
pd.set_option("display.max_colwidth", None)  # None = never shorten long cell contents

* With `max_columns = 200`, when you print a DataFrame or a similar data structure using pandas, it will display up to 200 columns before truncating or summarizing the display.
* This can be useful when working with wide datasets with a large number of columns, as it ensures that you can see more information at once.
* Likewise, `max_rows = 1000` lets pandas display up to 1000 rows before truncating the output.
* **pd.options** :  This refers to the options or settings within the pandas library
* **pd.options.display**: This is the group of options (accessed as attributes, not a submodule) that controls how pandas displays data in various formats.
* **pd.options.display.max_columns** : This is an option within the display group that sets the maximum number of columns to be displayed when you print a DataFrame or a similar data structure. By default, pandas may truncate the display of columns if there are too many, and this option allows you to override that behavior.
***
**max_colwidth = None**
* This is an option within the display group that sets the maximum width (in characters) of a column when displayed.
* By default, pandas may truncate the display of column contents if they are too long, and this option allows you to control that behavior.
* Setting this option to None means that there is no maximum width enforced for the display of column contents. In other words, it allows pandas to display the full content of each cell in a column, regardless of its length.