# Grouping Data

## Pandas GroupBy
Groupby mainly refers to a process involving one or more of the following steps they are: 
 

Splitting : It is a process in which we split data into group by applying some conditions on datasets.

Applying : It is a process in which we apply a function to each group independently.

Combining : It is a process in which we combine different datasets after applying groupby and results into a data structure

In [2]:
# importing pandas module
import pandas as pd

# Define a dictionary containing employee data
data1 = {'Name':['Ritika', 'Pranay', 'Ritika', 'Ayush','Rutuja', 'Pranay', 'Ayush', 'Sammy'],
         
        'Age':[27, 24, 22, 32,33, 36, 27, 32],
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj','Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'],
         
        'Qualification':['Msc', 'MCS', 'MCA', 'Phd','B.Tech', 'M.Tech', 'Msc', 'MBA']}

# Convert the dictionary into DataFrame
df = pd.DataFrame(data1)

print(df)

     Name  Age    Address Qualification
0  Ritika   27     Nagpur           Msc
1  Pranay   24     Kanpur           MCS
2  Ritika   22  Allahabad           MCA
3   Ayush   32    Kannuaj           Phd
4  Rutuja   33    Jaunpur        B.Tech
5  Pranay   36     Kanpur        M.Tech
6   Ayush   27  Allahabad           Msc
7   Sammy   32    Aligarh           MBA


### Grouping data with single key

In [3]:
# using groupby function
# with one key

gk=df.groupby('Name')
print(df.groupby('Name').groups)
gk.first()

{'Ayush': [3, 6], 'Pranay': [1, 5], 'Ritika': [0, 2], 'Rutuja': [4], 'Sammy': [7]}


Unnamed: 0_level_0,Age,Address,Qualification
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ayush,32,Kannuaj,Phd
Pranay,24,Kanpur,MCS
Ritika,27,Nagpur,Msc
Rutuja,33,Jaunpur,B.Tech
Sammy,32,Aligarh,MBA


### Grouping data with multiple keys 

In [4]:
# Using multiple keys in
# groupby() function
gk=df.groupby(['Name', 'Qualification'])

print(df.groupby(['Name', 'Qualification']).groups)
gk.first()

{('Ayush', 'Msc'): [6], ('Ayush', 'Phd'): [3], ('Pranay', 'M.Tech'): [5], ('Pranay', 'MCS'): [1], ('Ritika', 'MCA'): [2], ('Ritika', 'Msc'): [0], ('Rutuja', 'B.Tech'): [4], ('Sammy', 'MBA'): [7]}


Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Address
Name,Qualification,Unnamed: 2_level_1,Unnamed: 3_level_1
Ayush,Msc,27,Allahabad
Ayush,Phd,32,Kannuaj
Pranay,M.Tech,36,Kanpur
Pranay,MCS,24,Kanpur
Ritika,MCA,22,Allahabad
Ritika,Msc,27,Nagpur
Rutuja,B.Tech,33,Jaunpur
Sammy,MBA,32,Aligarh


### Grouping data by sorting keys:

In [5]:
# using groupby function
# without using sort.
df.groupby(['Name']).sum(numeric_only=True)

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ayush,59
Pranay,60
Ritika,49
Rutuja,33
Sammy,32


In [15]:
# using groupby function
# with sort
df.groupby(['Name'], sort = False).sum(numeric_only=True)

df.groupby(['Name'], sort = True).sum(numeric_only=True)


Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Ayush,59
Pranay,60
Ritika,49
Rutuja,33
Sammy,32


### Grouping data with object attributes

In [6]:
# using keys for grouping
# data

df.groupby('Name').groups

{'Ayush': [3, 6], 'Pranay': [1, 5], 'Ritika': [0, 2], 'Rutuja': [4], 'Sammy': [7]}

### Iterating through groups

In [16]:
# iterating an element
# of group

grp = df.groupby('Name')
for name, group in grp:
	print(name)
	print(group)
	print()

Ayush
    Name  Age    Address Qualification
3  Ayush   32    Kannuaj           Phd
6  Ayush   27  Allahabad           Msc

Pranay
     Name  Age Address Qualification
1  Pranay   24  Kanpur           MCS
5  Pranay   36  Kanpur        M.Tech

Ritika
     Name  Age    Address Qualification
0  Ritika   27     Nagpur           Msc
2  Ritika   22  Allahabad           MCA

Rutuja
     Name  Age  Address Qualification
4  Rutuja   33  Jaunpur        B.Tech

Sammy
    Name  Age  Address Qualification
7  Sammy   32  Aligarh           MBA



In [17]:
# iterating an element
# of group containing
# multiple keys

grp = df.groupby(['Name', 'Qualification'])
for name, group in grp:
	print(name)
	print(group)
	print()

('Ayush', 'Msc')
    Name  Age    Address Qualification
6  Ayush   27  Allahabad           Msc

('Ayush', 'Phd')
    Name  Age  Address Qualification
3  Ayush   32  Kannuaj           Phd

('Pranay', 'M.Tech')
     Name  Age Address Qualification
5  Pranay   36  Kanpur        M.Tech

('Pranay', 'MCS')
     Name  Age Address Qualification
1  Pranay   24  Kanpur           MCS

('Ritika', 'MCA')
     Name  Age    Address Qualification
2  Ritika   22  Allahabad           MCA

('Ritika', 'Msc')
     Name  Age Address Qualification
0  Ritika   27  Nagpur           Msc

('Rutuja', 'B.Tech')
     Name  Age  Address Qualification
4  Rutuja   33  Jaunpur        B.Tech

('Sammy', 'MBA')
    Name  Age  Address Qualification
7  Sammy   32  Aligarh           MBA



### Selecting a groups

In [7]:
# selecting a single group
grp = df.groupby('Name')
grp.get_group('Ritika')

Unnamed: 0,Name,Age,Address,Qualification
0,Ritika,27,Nagpur,Msc
2,Ritika,22,Allahabad,MCA


In [8]:
# selecting object grouped
# on multiple columns

grp = df.groupby(['Name', 'Qualification'])
grp.get_group(('Pranay', 'MCS'))


Unnamed: 0,Name,Age,Address,Qualification
1,Pranay,24,Kanpur,MCS


# Grouping Rows in pandas

In [11]:
# importing Pandas
import pandas as pd

# example dataframe
example = {'Team':['Arsenal', 'Manchester United', 'Arsenal',
				'Arsenal', 'Chelsea', 'Manchester United',
				'Manchester United', 'Chelsea', 'Chelsea', 'Chelsea'],
					
		'Player':['Ozil', 'Pogba', 'Lucas', 'Aubameyang',
					'Hazard', 'Mata', 'Lukaku', 'Morata',
										'Giroud', 'Kante'],
										
		'Goals':[5, 3, 6, 4, 9, 2, 0, 5, 2, 3] }

df = pd.DataFrame(example)

print(df)


                Team      Player  Goals
0            Arsenal        Ozil      5
1  Manchester United       Pogba      3
2            Arsenal       Lucas      6
3            Arsenal  Aubameyang      4
4            Chelsea      Hazard      9
5  Manchester United        Mata      2
6  Manchester United      Lukaku      0
7            Chelsea      Morata      5
8            Chelsea      Giroud      2
9            Chelsea       Kante      3


In [12]:
total_goals=df['Goals'].groupby(df['Team'])
print(total_goals.sum())

Team
Arsenal              15
Chelsea              19
Manchester United     5
Name: Goals, dtype: int64


### Combining multiple columns in Pandas groupby with dictionary

In [13]:
# importing pandas as pd
import pandas as pd

# Creating a dictionary
d = {'id':['1', '2', '3'],
	'Column 1.1':[14, 15, 16],
	'Column 1.2':[10, 10, 10],
	'Column 1.3':[1, 4, 5],
	'Column 2.1':[1, 2, 3],
	'Column 2.2':[10, 10, 10], }

# Converting dictionary into a data-frame
df = pd.DataFrame(d)
print(df)

  id  Column 1.1  Column 1.2  Column 1.3  Column 2.1  Column 2.2
0  1          14          10           1           1          10
1  2          15          10           4           2          10
2  3          16          10           5           3          10


In [14]:
# Creating the groupby dictionary
groupby_dict = {'Column 1.1':'Column 1',
				'Column 1.2':'Column 1',
				'Column 1.3':'Column 1',
				'Column 2.1':'Column 2',
				'Column 2.2':'Column 2' }

# Set the index of df as Column 'id'
df = df.set_index('id')

# Groupby the groupby_dict created above
df = df.groupby(groupby_dict, axis = 1).min()
print(df)

    Column 1  Column 2
id                    
1          1         1
2          4         2
3          5         3


# Concatenating DataFrames

In [11]:
data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
        'Name':['Ayush', 'Pranay', 'Ritika', 'Rutuja'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Pune', 'Mumbai', 'Bangalore'], 
        'Qualification':['Msc', 'MCS', 'MCA', 'Phd']} 
   
# Define a dictionary containing employee data 
data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
        'Name':['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'], 
        'Age':[17, 14, 12, 52], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=[4, 5, 6, 7])
 
print(df, "\n\n", df1) 


  key    Name  Age    Address Qualification
0  K0   Ayush   27     Nagpur           Msc
1  K1  Pranay   24       Pune           MCS
2  K2  Ritika   22     Mumbai           MCA
3  K3  Rutuja   32  Bangalore           Phd 

   key    Name  Age    Address Qualification
4  K0    Abhi   17     Nagpur         Btech
5  K1  Ayushi   14     Kanpur           B.A
6  K2  Dhiraj   12  Allahabad          Bcom
7  K3  Hitesh   52    Kannuaj        B.hons


In [12]:
# using a .concat() method
frames = [df, df1]

res1 = pd.concat(frames)
res1

Unnamed: 0,key,Name,Age,Address,Qualification
0,K0,Ayush,27,Nagpur,Msc
1,K1,Pranay,24,Pune,MCS
2,K2,Ritika,22,Mumbai,MCA
3,K3,Rutuja,32,Bangalore,Phd
4,K0,Abhi,17,Nagpur,Btech
5,K1,Ayushi,14,Kanpur,B.A
6,K2,Dhiraj,12,Allahabad,Bcom
7,K3,Hitesh,52,Kannuaj,B.hons


In [17]:
# using append function
 
res = df.append(df1)
res

Unnamed: 0,key,Name,Age,Address,Qualification
0,K0,Ayush,27,Nagpur,Msc
1,K1,Pranay,24,Pune,MCS
2,K2,Ritika,22,Mumbai,MCA
3,K3,Rutuja,32,Bangalore,Phd
4,K0,Abhi,17,Nagpur,Btech
5,K1,Ayushi,14,Kanpur,B.A
6,K2,Dhiraj,12,Allahabad,Bcom
7,K3,Hitesh,52,Kannuaj,B.hons


In [13]:
# using keys 
frames = [df, df1 ]
 
res = pd.concat(frames, keys=['x', 'y'])
res

Unnamed: 0,Unnamed: 1,key,Name,Age,Address,Qualification
x,0,K0,Ayush,27,Nagpur,Msc
x,1,K1,Pranay,24,Pune,MCS
x,2,K2,Ritika,22,Mumbai,MCA
x,3,K3,Rutuja,32,Bangalore,Phd
y,4,K0,Abhi,17,Nagpur,Btech
y,5,K1,Ayushi,14,Kanpur,B.A
y,6,K2,Dhiraj,12,Allahabad,Bcom
y,7,K3,Hitesh,52,Kannuaj,B.hons


# Merging DataFrame

In [19]:
# using .merge() function
res = pd.merge(df, df1, on='key')
 
res

Unnamed: 0,key,Name_x,Age_x,Address_x,Qualification_x,Name_y,Age_y,Address_y,Qualification_y
0,K0,Ayush,27,Nagpur,Msc,Abhi,17,Nagpur,Btech
1,K1,Pranay,24,Pune,MCS,Ayushi,14,Kanpur,B.A
2,K2,Ritika,22,Mumbai,MCA,Dhiraj,12,Allahabad,Bcom
3,K3,Rutuja,32,Bangalore,Phd,Hitesh,52,Kannuaj,B.hons


## Merging Using Joins

In [20]:
data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K1', 'K0', 'K1'],
        'Name':['Ayush', 'Pranay', 'Ritika', 'Rutuja'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Pune', 'Mumbai', 'Bangalore'], 
        'Qualification':['Msc', 'MCS', 'MCA', 'Phd']} 
   
# Define a dictionary containing employee data 
data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K0', 'K0', 'K0'],
        'Name':['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'], 
        'Age':[17, 14, 12, 52], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=[4, 5, 6, 7])
 
print(df, "\n\n", df1) 


  key key1    Name  Age    Address Qualification
0  K0   K0   Ayush   27     Nagpur           Msc
1  K1   K1  Pranay   24       Pune           MCS
2  K2   K0  Ritika   22     Mumbai           MCA
3  K3   K1  Rutuja   32  Bangalore           Phd 

   key key1    Name  Age    Address Qualification
4  K0   K0    Abhi   17     Nagpur         Btech
5  K1   K0  Ayushi   14     Kanpur           B.A
6  K2   K0  Dhiraj   12  Allahabad          Bcom
7  K3   K0  Hitesh   52    Kannuaj        B.hons


### Left Join:

In [21]:
# using keys from left frame
res = pd.merge(df, df1, how='left', on=['key', 'key1'])
 
res

Unnamed: 0,key,key1,Name_x,Age_x,Address_x,Qualification_x,Name_y,Age_y,Address_y,Qualification_y
0,K0,K0,Ayush,27,Nagpur,Msc,Abhi,17.0,Nagpur,Btech
1,K1,K1,Pranay,24,Pune,MCS,,,,
2,K2,K0,Ritika,22,Mumbai,MCA,Dhiraj,12.0,Allahabad,Bcom
3,K3,K1,Rutuja,32,Bangalore,Phd,,,,


### Right Join:

In [22]:
# using keys from left frame
res = pd.merge(df, df1, how='right', on=['key', 'key1'])
 
res

Unnamed: 0,key,key1,Name_x,Age_x,Address_x,Qualification_x,Name_y,Age_y,Address_y,Qualification_y
0,K0,K0,Ayush,27.0,Nagpur,Msc,Abhi,17,Nagpur,Btech
1,K1,K0,,,,,Ayushi,14,Kanpur,B.A
2,K2,K0,Ritika,22.0,Mumbai,MCA,Dhiraj,12,Allahabad,Bcom
3,K3,K0,,,,,Hitesh,52,Kannuaj,B.hons


### Outer & Inner

In [23]:
# getting union  of keys
res2 = pd.merge(df, df1, how='outer', on=['key', 'key1'])
 
res2

Unnamed: 0,key,key1,Name_x,Age_x,Address_x,Qualification_x,Name_y,Age_y,Address_y,Qualification_y
0,K0,K0,Ayush,27.0,Nagpur,Msc,Abhi,17.0,Nagpur,Btech
1,K1,K1,Pranay,24.0,Pune,MCS,,,,
2,K2,K0,Ritika,22.0,Mumbai,MCA,Dhiraj,12.0,Allahabad,Bcom
3,K3,K1,Rutuja,32.0,Bangalore,Phd,,,,
4,K1,K0,,,,,Ayushi,14.0,Kanpur,B.A
5,K3,K0,,,,,Hitesh,52.0,Kannuaj,B.hons


In [24]:
# getting intersection  of keys
res2 = pd.merge(df, df1, how='inner', on=['key', 'key1'])
 
res2

Unnamed: 0,key,key1,Name_x,Age_x,Address_x,Qualification_x,Name_y,Age_y,Address_y,Qualification_y
0,K0,K0,Ayush,27,Nagpur,Msc,Abhi,17,Nagpur,Btech
1,K2,K0,Ritika,22,Mumbai,MCA,Dhiraj,12,Allahabad,Bcom


# Joining DataFrame

In [25]:
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32]} 
    
# Define a dictionary containing employee data 
data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']} 
  
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=['K0', 'K1', 'K2', 'K3'])
  
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=['K0', 'K2', 'K3', 'K4'])
 
 
print(df, "\n\n", df1) 

      Name  Age
K0     Jai   27
K1  Princi   24
K2  Gaurav   22
K3    Anuj   32 

       Address Qualification
K0  Allahabad           MCA
K2    Kannuaj           Phd
K3  Allahabad          Bcom
K4    Kannuaj        B.hons


In [26]:
# joining dataframe
res = df.join(df1)
 
res

Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27,Allahabad,MCA
K1,Princi,24,,
K2,Gaurav,22,Kannuaj,Phd
K3,Anuj,32,Allahabad,Bcom


In [27]:
# getting union
res1 = df.join(df1, how='outer')
 
res1

Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27.0,Allahabad,MCA
K1,Princi,24.0,,
K2,Gaurav,22.0,Kannuaj,Phd
K3,Anuj,32.0,Allahabad,Bcom
K4,,,Kannuaj,B.hons


## dataframe.append()

In [28]:
# to append df2 at the end of df1 dataframe
df.append(df1)


Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27.0,,
K1,Princi,24.0,,
K2,Gaurav,22.0,,
K3,Anuj,32.0,,
K0,,,Allahabad,MCA
K2,,,Kannuaj,Phd
K3,,,Allahabad,Bcom
K4,,,Kannuaj,B.hons


### str.join()

In [37]:
data = pd.read_csv('C:/Users/Pranay/Downloads/penguins.csv')
df=pd.DataFrame(data)
df

Unnamed: 0,Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,Torgersen,39.1,18.7,181.0,3750.0,0
1,Torgersen,39.5,17.4,186.0,3800.0,0
2,Torgersen,40.3,18.0,195.0,3250.0,0
3,Torgersen,,,,,0
4,Torgersen,36.7,19.3,193.0,3450.0,0
...,...,...,...,...,...,...
339,Dream,55.8,19.8,207.0,4000.0,2
340,Dream,43.5,18.1,202.0,3400.0,2
341,Dream,49.6,18.2,193.0,3775.0,2
342,Dream,50.8,19.0,210.0,4100.0,2


In [39]:
data.dropna(inplace = True)
    
# splitting string and overwriting 
data["Island"]= data["Island"].str.split("T")
  
# joining with "_"
data["Island"]= data["Island"].str.join("_")
  
# display
data

Unnamed: 0,Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,_orgersen,39.1,18.7,181.0,3750.0,0
1,_orgersen,39.5,17.4,186.0,3800.0,0
2,_orgersen,40.3,18.0,195.0,3250.0,0
4,_orgersen,36.7,19.3,193.0,3450.0,0
5,_orgersen,39.3,20.6,190.0,3650.0,0
...,...,...,...,...,...,...
339,Dream,55.8,19.8,207.0,4000.0,2
340,Dream,43.5,18.1,202.0,3400.0,2
341,Dream,49.6,18.2,193.0,3775.0,2
342,Dream,50.8,19.0,210.0,4100.0,2
