## Pandas.apply()

### For dataframe

df.apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)

### For series

series.apply(func, convert_dtype=True, args=(), **kwds)

### Explaning:
* func: The function to apply. This can be a NumPy function, a Python function, or a callable object.
* axis: The axis along which the function will be applied. Use axis=0 for applying the function to each column (default), and axis=1 for applying the function to each row.
* convert_dtype: For Series, this parameter controls whether to convert the data type of the result to the original data type (default is True).
* args: Additional arguments to pass to the function.

In [1]:
import pandas as pd

# Create a sample DataFrame
data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)

# Define a function to double the values
def double_values(x):
    return x * 2

# Apply the function to each column
result_df = df.apply(double_values)

print("Original DataFrame:\n", df)
print("\nResult after applying the function:\n", result_df)


Original DataFrame:
    A  B
0  1  4
1  2  5
2  3  6

Result after applying the function:
    A   B
0  2   8
1  4  10
2  6  12


In [2]:
import pandas as pd

# Create a sample Series
s = pd.Series([1, 2, 3])

# Define a function to square the values
def square_values(x):
    return x**2

# Apply the function to the Series
result_series = s.apply(square_values)

print("Original Series:\n", s)
print("\nResult after applying the function:\n", result_series)


Original Series:
 0    1
1    2
2    3
dtype: int64

Result after applying the function:
 0    1
1    4
2    9
dtype: int64


In [6]:
import pandas as pd

df=pd.read_csv('nba.csv')
df1=df.squeeze()
df1

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [13]:
import pandas as pd

data={'A': [1, 2, 3], 'B': [4, 5, 6]}
df=pd.DataFrame(data)
df.apply(lambda x: x**2)

Unnamed: 0,A,B
0,1,16
1,4,25
2,9,36


In [30]:
import pandas as pd
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}  
def func(row):
#     return row['A']+row['B']+row['C']
    return row.sum()
df=pd.DataFrame(data)
df['add']=df.apply(func,axis=1)
df

Unnamed: 0,A,B,C,add
0,1,4,7,12
1,2,5,8,15
2,3,6,9,18


In [33]:
import pandas as pd
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}  
def func(column):
#     return row['A']+row['B']+row['C']
    return column.sum()
df=pd.DataFrame(data)
df1=df.apply(func)
df1


A     6
B    15
C    24
dtype: int64

In [39]:
import pandas as pd
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}  
# def func(row):
# #     return row['A']+row['B']+row['C']
#     return row.sum()
df=pd.DataFrame(data)
df['add']=df.apply(lambda row: row.mean(),axis=1)
df

Unnamed: 0,A,B,C,add
0,1,4,7,4.0
1,2,5,8,5.0
2,3,6,9,6.0


In [41]:
import pandas as pd
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}  
def func(column):
#     return row['A']+row['B']+row['C']
    return column.sum()
df=pd.DataFrame(data)
df1=df.apply(lambda column:column.mean())
df1

A    2.0
B    5.0
C    8.0
dtype: float64

In [50]:
import pandas as pd
import numpy as np
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}  
df=pd.DataFrame(data)
df['add']=df.apply(np.std,axis=1)
df

Unnamed: 0,A,B,C,add
0,1,4,7,2.44949
1,2,5,8,2.44949
2,3,6,9,2.44949


## Normalization 

* column A has values of range from 0 to 100 
* column B hasa range from 0 to 10000
* to unserstand and compare this two columns we need to normalize 
* Normalization brings in scale of 0 to 1 
![Normalization-Formula.jpg](attachment:Normalization-Formula.jpg)


In [77]:
import pandas as pd
import numpy as np 

def normalize(x,y):
    x_new = ((x - np.mean([x, y])) /
             (max(x, y) - min(x, y)))
    return x_new
def main():
    data={
        'X':[1,2,3],
        'Y':[45,69,89]
    }
    df=pd.DataFrame(data)
#     print(df)
    
    df['Normalized value']=df.apply(lambda row: normalize(row['X'],row['Y']),axis=1)
    print(df)
    
if __name__== '__main__':
    main()

   X   Y  Normalized value
0  1  45              -0.5
1  2  69              -0.5
2  3  89              -0.5


In [17]:
import pandas as pd

# sr=pd.Series(["akash","anup","basavaraj","nirmala"])
sr=pd.Series([11,23,4,5,6,7,8,9,10,40,50,None])

index_for_series=pd.date_range('2010-10-09 08:45',periods=12,freq='Y')
# index_for_series

sr.index=index_for_series
res=sr.apply(lambda x: True if x>30 else False)

res

2010-12-31 08:45:00    False
2011-12-31 08:45:00    False
2012-12-31 08:45:00    False
2013-12-31 08:45:00    False
2014-12-31 08:45:00    False
2015-12-31 08:45:00    False
2016-12-31 08:45:00    False
2017-12-31 08:45:00    False
2018-12-31 08:45:00    False
2019-12-31 08:45:00     True
2020-12-31 08:45:00     True
2021-12-31 08:45:00    False
Freq: A-DEC, dtype: bool

## df.aggregate()




In [38]:
import pandas as pd
df=pd.read_csv('nba.csv')
df[:10].describe()

Unnamed: 0,Number,Age,Weight,Salary
count,10.0,10.0,10.0,9.0
mean,39.9,24.7,215.9,4585179.0
std,33.124177,2.945807,23.750789,3677792.0
min,0.0,21.0,180.0,1148640.0
25%,16.0,22.0,193.75,1824360.0
50%,33.0,25.0,225.5,3431040.0
75%,51.5,26.5,235.0,6796117.0
max,99.0,29.0,240.0,12000000.0


In [47]:
df1=df.aggregate({
    'Number':['sum','min','max','count','std','mean','median','var'],
    'Age':['sum','min','max','count','std','mean','median','var'],
    'Weight':['sum','min','max','count','std','mean','median','var'],
    'Salary':['sum','min','max','count','std','mean','var','median']
})
df1

Unnamed: 0,Number,Age,Weight,Salary
sum,8079.0,12311.0,101236.0,2159837000.0
min,0.0,19.0,161.0,30888.0
max,99.0,40.0,307.0,25000000.0
count,457.0,457.0,457.0,446.0
std,15.96609,4.404016,26.368343,5229238.0
mean,17.678337,26.938731,221.522976,4842684.0
median,13.0,26.0,220.0,2839073.0
var,254.916043,19.395361,695.289493,27344930000000.0


## df.mean()
Syntax: DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)

Parameters :

axis : {index (0), columns (1)}
skipna : Exclude NA/null values when computing the result
level : If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a Series
numeric_only : Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series.
Returns : mean : Series or DataFrame (if level specified)

In [58]:
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, 44, 1], 
				"B":[5, 2, 54, 3, 2], 
				"C":[20, 16, 7, 3, 8], 
				"D":[14, 3, 17, 2, 6]}) 

# Print the dataframe 
df 


Unnamed: 0,A,B,C,D
0,12,5,20,14
1,4,2,16,3
2,5,54,7,17
3,44,3,3,2
4,1,2,8,6


In [59]:
df.mean(axis=0)

A    13.2
B    13.2
C    10.8
D     8.4
dtype: float64

In [55]:
df.mean(axis=1)

0    12.75
1     6.25
2    20.75
3    13.00
4     4.25
dtype: float64

In [64]:
df = pd.DataFrame({"A":[12, 4, 5, 44, None], 
				"B":[5, 2, 54, 3, 2], 
				"C":[20, 16, 7, 3, 8], 
				"D":[14, 3, 17, 2, 6]}) 
df

Unnamed: 0,A,B,C,D
0,12.0,5,20,14
1,4.0,2,16,3
2,5.0,54,7,17
3,44.0,3,3,2
4,,2,8,6


In [65]:
df.mean(axis=0,skipna=True)

A    16.25
B    13.20
C    10.80
D     8.40
dtype: float64

In [68]:
df.mean(axis=1,skipna=True)

0    12.750000
1     6.250000
2    20.750000
3    13.000000
4     5.333333
dtype: float64

## pandas series.mean()

Syntax: Series.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)

Parameter :
* axis : Axis for the function to be applied on.
* skipna : Exclude NA/null values when computing the result.
* level : If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a scalar.
* numeric_only : Include only float, int, boolean columns.
* **kwargs : Additional keyword arguments to be passed to the function.


Returns : mean : scalar or Series (if level specified)

In [81]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series([10, 25, 3, 25, 24, 6,None]) 

# Create the Index 
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp','hfh'] 

# set the index 
sr.index = index_ 

# Print the series 
print(sr) 


Coca Cola    10.0
Sprite       25.0
Coke          3.0
Fanta        25.0
Dew          24.0
ThumbsUp      6.0
hfh           NaN
dtype: float64


In [82]:
sr.mean(axis=0,skipna=True)

15.5

## df.sem()
* standard error of the mean
* sem=std/√n
* number od sample in column/row[]count]

In [93]:
import pandas as pd

# Create a DataFrame
data = {'A': [1, 2, 3, 4, 5], 'B': [5, 4, 3, 2, None]}
df = pd.DataFrame(data)

# Calculate the Mean Absolute Deviation for each column
mad_result = df.sem(axis=1,skipna=False)

print(mad_result)


0    2.0
1    1.0
2    0.0
3    1.0
4    NaN
dtype: float64


## series.values_count()
* return unique value count of each series 
## df.index.values_count()
* return unique value count of each index 

In [94]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon', 'Rio', 'Chicago', 'Lisbon']) 

# Print the series 
print(sr) 


0    New York
1     Chicago
2     Toronto
3      Lisbon
4         Rio
5     Chicago
6      Lisbon
dtype: object


In [96]:
sr.value_counts()

Chicago     2
Lisbon      2
New York    1
Toronto     1
Rio         1
Name: count, dtype: int64

In [97]:
sr.index.value_counts()

0    1
1    1
2    1
3    1
4    1
5    1
6    1
Name: count, dtype: int64

In [104]:
import pandas as pd
df=pd.read_csv('nba.csv')
df1=df['Name']
df1.value_counts()

Name
Avery Bradley       1
Kyle Korver         1
Al Horford          1
Kirk Hinrich        1
Tim Hardaway Jr.    1
                   ..
Eric Moreland       1
Ben McLemore        1
Kosta Koufos        1
Rudy Gay            1
Jeff Withey         1
Name: count, Length: 457, dtype: int64

In [2]:
# importing pandas library
import pandas as pd

# creating and initializing a list
values= [['Rohan',455],['Elvish',250],['Deepak',495],
		['Soni',400],['Radhika',350],['Vansh',450]] 

# creating a pandas dataframe
df = pd.DataFrame(values,columns=['Name','Total_Marks'])
# Applying lambda function to find 
# percentage of 'Total_Marks' column 
# using df.assign()
df = df.assign(Percentage = lambda x: (x['Total_Marks'] /500 * 100))

# displaying the data frame
df


Unnamed: 0,Name,Total_Marks,Percentage
0,Rohan,455,91.0
1,Elvish,250,50.0
2,Deepak,495,99.0
3,Soni,400,80.0
4,Radhika,350,70.0
5,Vansh,450,90.0


In [28]:
import pandas as pd

values_list = [[15, 2.5, 100], [20, 4.5, 50], [25, 5.2, 80],
               [45, 5.8, 48], [40, 6.3, 70], [41, 6.4, 90],
               [51, 2.3, 111]]

df=pd.DataFrame(values_list,columns=['Field1','Field2','Field3'])

df=df.assign(new=lambda x: (x['Field1']*x['Field2']*x['Field3']))
df=df.apply(lambda x: x*x if x.name in [2,3] else x,axis=1)

df

Unnamed: 0,Field1,Field2,Field3,new
0,15.0,2.5,100.0,3750.0
1,20.0,4.5,50.0,4500.0
2,625.0,27.04,6400.0,108160000.0
3,2025.0,33.64,2304.0,156950784.0
4,40.0,6.3,70.0,17640.0
5,41.0,6.4,90.0,23616.0
6,51.0,2.3,111.0,13020.3


## df.insert()
DataFrame.insert(loc, column, value, allow_duplicates=False)
## df.assign()


In [54]:
import pandas as pd

data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
		'Height': [5.1, 6.2, 5.1, 5.2],
		'Qualification': ['Msc', 'MA', 'Msc', 'Msc']}

df = pd.DataFrame(data)
print(df)


     Name  Height Qualification
0     Jai     5.1           Msc
1  Princi     6.2            MA
2  Gaurav     5.1           Msc
3    Anuj     5.2           Msc


In [55]:
df.insert(3,'Age',[23,45,67,67],allow_duplicates=False)

In [61]:
df=df.assign(new_column=lambda x: x['Height']*x['Age'],new_column2=lambda x:x['Name']+'_ak')
df

Unnamed: 0,Name,Height,Qualification,Age,new_column,new_column2
0,Jai,5.1,Msc,23,117.3,Jai_ak
1,Princi,6.2,MA,45,279.0,Princi_ak
2,Gaurav,5.1,Msc,67,341.7,Gaurav_ak
3,Anuj,5.2,Msc,67,348.4,Anuj_ak


In [70]:
import pandas as pd
 
# Define a dictionary containing Students data
data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Height': [5.1, 6.2, 5.1, 5.2],
        'Qualification': ['Msc', 'MA', 'Msc', 'Msc']}
df=pd.DataFrame(data)
Address={ 'Jai':'Delhi', 'Princi':'Bangalore',
           'Patna': 'Gaurav', 'Chennai': 'Anuj'}
df['Address']=df['Name'].str.lower().map(Address)
df

Unnamed: 0,Name,Height,Qualification,Address
0,Jai,5.1,Msc,
1,Princi,6.2,MA,
2,Gaurav,5.1,Msc,
3,Anuj,5.2,Msc,


In [81]:
# Import pandas package
import pandas as pd

# Define a dictionary containing Students data
data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Height': [5.1, 6.2, 5.1, 5.2],
        'Qualification': ['Msc', 'MA', 'Msc', 'Msc']}

# Define a dictionary with key values of
# an existing column and their respective
# value pairs as the values for our new column.
# address = {0:"Delhi",1:'Banglore'}
address=[1,2,3,4]

# Convert the dictionary into DataFrame
df = pd.DataFrame(data)

# Provide 'Address' as the column name
df.loc[:,'Address'] = address

# Observe the output
print(df)


     Name  Height Qualification  Address
0     Jai     5.1           Msc        1
1  Princi     6.2            MA        2
2  Gaurav     5.1           Msc        3
3    Anuj     5.2           Msc        4


In [89]:


import pandas as pd
 
data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Height': [5.1, 6.2, 5.1, 5.2],
        'Qualification': ['Msc', 'MA', 'Msc', 'Msc'],
        'Address': ['Delhi', 'Bangalore', 'Chennai', 'Patna']}
 
df = pd.DataFrame(data)
Age=[23,45,68,90]
State=['KAr','Tam','Goa','Tep']
new_data={'Age':Age,'State':State}
df=df.assign(**new_data)
df

Unnamed: 0,Name,Height,Qualification,Address,Age,State
0,Jai,5.1,Msc,Delhi,23,KAr
1,Princi,6.2,MA,Bangalore,45,Tam
2,Gaurav,5.1,Msc,Chennai,68,Goa
3,Anuj,5.2,Msc,Patna,90,Tep


## df.drop()
### Syntax: DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors=’raise’)
* labels: String or list of strings referring row or column name.
* axis: int or string value, 0 ‘index’ for Rows and 1 ‘columns’ for Columns.
* index or columns: Single label or list. index or columns are an alternative to axis and cannot be used together.
* level: Used to specify level in case data f* rame is having multiple level index.
* inplace: Makes changes in original Data Frame if True.
* errors: Ignores error if any value from the list doesn’t exists and drops rest of the values when errors = ‘ignore’
* Return type: Dataframe with dropped values

In [91]:
# importing pandas module
import pandas as pd

# making data frame from csv file
data = pd.read_csv("nba.csv", index_col="Name")
data


Unnamed: 0_level_0,Team,Number,Position,Age,Height,Weight,College,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...
Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [92]:
data.drop(['Avery Bradley','Jae Crowder','R.J. Hunter'],inplace=True)

In [93]:
data

Unnamed: 0_level_0,Team,Number,Position,Age,Height,Weight,College,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...
Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [94]:
data.drop(['Team','Number'],inplace=True,axis=1)

In [95]:
data

Unnamed: 0_level_0,Position,Age,Height,Weight,College,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Holland,SG,27.0,6-5,205.0,Boston University,
Jonas Jerebko,PF,29.0,6-10,231.0,,5000000.0
Amir Johnson,PF,29.0,6-9,240.0,,12000000.0
Jordan Mickey,PF,21.0,6-8,235.0,LSU,1170960.0
Kelly Olynyk,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...
Shelvin Mack,PG,26.0,6-3,203.0,Butler,2433333.0
Raul Neto,PG,24.0,6-1,179.0,,900000.0
Tibor Pleiss,C,26.0,7-3,256.0,,2900000.0
Jeff Withey,C,26.0,7-0,231.0,Kansas,947276.0


## Df.truncate()
### Syntax: DataFrame.truncate(before=None, after=None, axis=None, copy=True)

Parameter :
* before : Truncate all rows before this index value.
* after : Truncate all rows after this index value.
* axis : Axis to truncate. Truncates the index (rows) by default.
* copy : Return a copy of the truncated section.


Returns : The truncated Series or DataFrame.

In [114]:
import pandas as pd

df = pd.DataFrame({'Weight':[45, 88, 56, 15, 71], 
                   'Name':['Sam', 'Andrea', 'Alex', 'Robin', 'Kia'], 
                   'Age':[14, 25, 55, 8, 21]}) 
df.index= pd.date_range('2010-10-09 08:45', periods = 5, freq ='H') 
df=df.truncate(before='2010-01-01 09:45:00',after='2010-01-01 11:45:00')
df

Unnamed: 0,Weight,Name,Age


In [122]:
import pandas as pd

df = pd.DataFrame({'Weight': [45, 88, 56, 15, 71],
                   'Name': ['Sam', 'Andrea', 'Alex', 'Robin', 'Kia'],
                   'Age': [14, 25, 55, 8, 21]})
df.index = pd.date_range('2010-01-01 08:45', periods=5, freq='H')

# Truncate the DataFrame and assign the result back to df
df = df.truncate(before='2010-01-01 09:45:00',after='2010-01-01 11:45:00')

# Display the resulting DataFrame
print(df)


                     Weight    Name  Age
2010-01-01 09:45:00      88  Andrea   25
2010-01-01 10:45:00      56    Alex   55
2010-01-01 11:45:00      15   Robin    8


In [123]:
# importing pandas as pd 
import pandas as pd 

# Creating the DataFrame 
df = pd.DataFrame({"A":[12, 4, 5, None, 1], 
				"B":[7, 2, 54, 3, None], 
				"C":[20, 16, 11, 3, 8], 
				"D":[14, 3, None, 2, 6]}) 

# Create the index 
index_ = ['Row_1', 'Row_2', 'Row_3', 'Row_4', 'Row_5'] 

# Set the index 
df.index = index_ 

# Print the DataFrame 
print(df) 


          A     B   C     D
Row_1  12.0   7.0  20  14.0
Row_2   4.0   2.0  16   3.0
Row_3   5.0  54.0  11   NaN
Row_4   NaN   3.0   3   2.0
Row_5   1.0   NaN   8   6.0


In [126]:
df=df.truncate(before='Row_2',after='Row_4')
df

Unnamed: 0,A,B,C,D
Row_2,4.0,2.0,16,3.0
Row_3,5.0,54.0,11,
Row_4,,3.0,3,2.0


## df.series.truncate()
### Syntax: Series.truncate(before=None, after=None, axis=None, copy=True)

Parameter :
before : Truncate all rows before this index value.
after : Truncate all rows after this index value.
axis : Axis to truncate. Truncates the index (rows) by default.
copy : Return a copy of the truncated section.


Returns : truncated Series or DataFrame.

In [132]:
# importing pandas as pd 
import pandas as pd 

# Creating the Series 
sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon', 'Rio', 'Moscow']) 

# Create the Datetime Index 
didx = pd.date_range(start ='2014-08-01 10:00', freq ='W', 
					periods = 6, tz = 'Europe/Berlin') 

# set the index 
sr.index = didx 


# Print the series 
print(sr) 


2014-08-03 10:00:00+02:00    New York
2014-08-10 10:00:00+02:00     Chicago
2014-08-17 10:00:00+02:00     Toronto
2014-08-24 10:00:00+02:00      Lisbon
2014-08-31 10:00:00+02:00         Rio
2014-09-07 10:00:00+02:00      Moscow
Freq: W-SUN, dtype: object


In [134]:
sr.truncate(before='2014-08-17 10:00:00+02:00 ')

2014-08-17 10:00:00+02:00    Toronto
2014-08-24 10:00:00+02:00     Lisbon
2014-08-31 10:00:00+02:00        Rio
2014-09-07 10:00:00+02:00     Moscow
Freq: W-SUN, dtype: object

## Iterate Over Rows with Pandas

### Row Iteration Using iterrows()

In [135]:
# importing pandas as pd
import pandas as pd

# dictionary of lists
dict = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
		'degree': ["MBA", "BCA", "M.Tech", "MBA"],
		'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary
df = pd.DataFrame(dict)

print(df)


     name  degree  score
0  aparna     MBA     90
1  pankaj     BCA     40
2  sudhir  M.Tech     80
3   Geeku     MBA     98


In [137]:
for i, j in df.iterrows():
    print (i, j)
    print()

0 name      aparna
degree       MBA
score         90
Name: 0, dtype: object

1 name      pankaj
degree       BCA
score         40
Name: 1, dtype: object

2 name      sudhir
degree    M.Tech
score         80
Name: 2, dtype: object

3 name      Geeku
degree      MBA
score        98
Name: 3, dtype: object



In [143]:
import pandas as pd

data=pd.read_csv('nba.csv')
for i, j in data.iterrows():
    print(i,j)
    print()

0 Name         Avery Bradley
Team        Boston Celtics
Number                 0.0
Position                PG
Age                   25.0
Height                 6-2
Weight               180.0
College              Texas
Salary           7730337.0
Name: 0, dtype: object

1 Name           Jae Crowder
Team        Boston Celtics
Number                99.0
Position                SF
Age                   25.0
Height                 6-6
Weight               235.0
College          Marquette
Salary           6796117.0
Name: 1, dtype: object

2 Name             John Holland
Team           Boston Celtics
Number                   30.0
Position                   SG
Age                      27.0
Height                    6-5
Weight                  205.0
College     Boston University
Salary                    NaN
Name: 2, dtype: object

3 Name           R.J. Hunter
Team        Boston Celtics
Number                28.0
Position                SG
Age                   22.0
Height                 6-5
We

418 Name                  Enes Kanter
Team        Oklahoma City Thunder
Number                       11.0
Position                        C
Age                          24.0
Height                       6-11
Weight                      245.0
College                  Kentucky
Salary                 16407500.0
Name: 418, dtype: object

419 Name                 Mitch McGary
Team        Oklahoma City Thunder
Number                       33.0
Position                       PF
Age                          24.0
Height                       6-10
Weight                      255.0
College                  Michigan
Salary                  1463040.0
Name: 419, dtype: object

420 Name                Nazr Mohammed
Team        Oklahoma City Thunder
Number                       13.0
Position                        C
Age                          38.0
Height                       6-10
Weight                      250.0
College                  Kentucky
Salary                   222888.0
Name: 420, dtype: 

## Row Iteration Using itertuples()

In [147]:
for i in data.itertuples():
    print(i)
    print()

Pandas(Index=0, Name='Avery Bradley', Team='Boston Celtics', Number=0.0, Position='PG', Age=25.0, Height='6-2', Weight=180.0, College='Texas', Salary=7730337.0)

Pandas(Index=1, Name='Jae Crowder', Team='Boston Celtics', Number=99.0, Position='SF', Age=25.0, Height='6-6', Weight=235.0, College='Marquette', Salary=6796117.0)

Pandas(Index=2, Name='John Holland', Team='Boston Celtics', Number=30.0, Position='SG', Age=27.0, Height='6-5', Weight=205.0, College='Boston University', Salary=nan)

Pandas(Index=3, Name='R.J. Hunter', Team='Boston Celtics', Number=28.0, Position='SG', Age=22.0, Height='6-5', Weight=185.0, College='Georgia State', Salary=1148640.0)

Pandas(Index=4, Name='Jonas Jerebko', Team='Boston Celtics', Number=8.0, Position='PF', Age=29.0, Height='6-10', Weight=231.0, College=nan, Salary=5000000.0)

Pandas(Index=5, Name='Amir Johnson', Team='Boston Celtics', Number=90.0, Position='PF', Age=29.0, Height='6-9', Weight=240.0, College=nan, Salary=12000000.0)

Pandas(Index=6, Na

In [161]:

import pandas as pd

data=pd.read_csv('nba.csv')
columns = list(data)
for col_name in columns:
    print(col_name)
    print(data[col_name])
    print()

Name
0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

Team
0      Boston Celtics
1      Boston Celtics
2      Boston Celtics
3      Boston Celtics
4      Boston Celtics
            ...      
453         Utah Jazz
454         Utah Jazz
455         Utah Jazz
456         Utah Jazz
457               NaN
Name: Team, Length: 458, dtype: object

Number
0       0.0
1      99.0
2      30.0
3      28.0
4       8.0
       ... 
453     8.0
454    25.0
455    21.0
456    24.0
457     NaN
Name: Number, Length: 458, dtype: float64

Position
0       PG
1       SF
2       SG
3       SG
4       PF
      ... 
453     PG
454     PG
455      C
456      C
457    NaN
Name: Position, Length: 458, dtype: object

Age
0      25.0
1      25.0
2      27.0
3      22.0
4      29.0
       ... 
453  

## Pandas Dataframe.sort_values()
### Syntax: DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind=’quicksort’, na_position=’last’)

Note: Every parameter has some default values except the ‘by’ parameter.


Parameters: 

* by: Single/List of column names to sort Data Frame by. 
* axis: 0 or ‘index’ for rows and 1 or ‘columns’ for Column. 
* ascending: Boolean value which sorts Data frame in ascending order if True. 
* inplace: Boolean value. Makes the changes in passed data frame itself if True. 
* kind: String which can have three inputs(‘quicksort’, ‘mergesort’ or ‘heapsort’) of algorithm used to sort data frame. 
* na_position: Takes two string input ‘last’ or ‘first’ to set position of Null values. Default is ‘last’.

Return Type: 

* Returns a sorted Data Frame with Same dimensions as of the function caller DataFrame.

In [174]:
# importing pandas package
import pandas as pd

# making data frame from csv file
data = pd.read_csv("nba.csv")

# data.sort_values(by='Salary',axis=0,ascending=True,inplace=True,na_position='first')

data.sort_values(by=['Team','Name'],axis=0,ascending=[True,False],inplace=True,na_position='last')

# display
data[:36]


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24.0,6-6,205.0,Michigan,1304520.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0
320,Thabo Sefolosha,Atlanta Hawks,25.0,SF,32.0,6-7,220.0,,4000000.0
315,Paul Millsap,Atlanta Hawks,4.0,PF,31.0,6-8,246.0,Louisiana Tech,18671659.0
319,Mike Scott,Atlanta Hawks,32.0,PF,27.0,6-8,237.0,Virginia,3333333.0
316,Mike Muscala,Atlanta Hawks,31.0,PF,24.0,6-11,240.0,Bucknell,947276.0
317,Lamar Patterson,Atlanta Hawks,13.0,SG,24.0,6-5,225.0,Pittsburgh,525093.0
314,Kyle Korver,Atlanta Hawks,26.0,SG,35.0,6-7,212.0,Creighton,5746479.0
313,Kris Humphries,Atlanta Hawks,43.0,PF,31.0,6-9,235.0,Minnesota,1000000.0


In [175]:
import pandas as pd

# making data frame from csv file
data = pd.read_csv("nba.csv")

# data.sort_values(by='Salary',axis=0,ascending=True,inplace=True,na_position='first')

data.sort_values(by=['Team','Age','Height'],axis=0,ascending=[False,True,False],inplace=True,na_position='last')

# display
data[:36]

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
377,Kelly Oubre Jr.,Washington Wizards,12.0,SF,20.0,6-7,205.0,Kansas,1920240.0
369,Bradley Beal,Washington Wizards,3.0,SG,22.0,6-5,207.0,Florida,5694674.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
371,Jarell Eddie,Washington Wizards,8.0,SG,24.0,6-7,218.0,Virginia Tech,561716.0
382,John Wall,Washington Wizards,2.0,PG,25.0,6-4,195.0,Kentucky,15851950.0
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
374,JJ Hickson,Washington Wizards,21.0,C,27.0,6-9,242.0,North Carolina State,273038.0
381,Marcus Thornton,Washington Wizards,15.0,SF,29.0,6-4,205.0,LSU,200600.0
370,Jared Dudley,Washington Wizards,1.0,SF,30.0,6-7,225.0,Boston College,4375000.0
380,Garrett Temple,Washington Wizards,17.0,SG,30.0,6-6,195.0,LSU,1100602.0


In [184]:
import pandas as pd

dict_ = {'Name':['Martha', 'Tim', 'Rob', 'Georgia'],
        'Maths':[87, 91, 97, 95],
        'Science':[83, 99, 84, 76]
       }

df1=pd.DataFrame(dict_)

df.loc[len(df.index)]=['Ammy',45,56]
# dict2={'Name':'Akash','Maths':99,'Science':77}
dict2={'Name':['Akash'],'Maths':[99],'Science':[77]}

df2=pd.DataFrame(dict2)
df=pd.concat([df1,df2],ignore_index=True)
df

Unnamed: 0,Name,Maths,Science
0,Martha,87,83
1,Tim,91,99
2,Rob,97,84
3,Georgia,95,76
4,Akash,99,77


In [180]:
from IPython.display import display, HTML

import pandas as pd
import numpy as np

dict = {'Name':['Martha', 'Tim', 'Rob', 'Georgia'],
        'Maths':[87, 91, 97, 95],
        'Science':[83, 99, 84, 76]
       }

df1 = pd.DataFrame(dict)
display(df1)

dict = {'Name':['Amy', 'Maddy'],
        'Maths':[89, 90],
        'Science':[93, 81]
       }

df2 = pd.DataFrame(dict)
display(df2)

df3 = pd.concat([df1, df2], ignore_index = True)
df3.reset_index()

display(df3)


Unnamed: 0,Name,Maths,Science
0,Martha,87,83
1,Tim,91,99
2,Rob,97,84
3,Georgia,95,76


Unnamed: 0,Name,Maths,Science
0,Amy,89,93
1,Maddy,90,81


Unnamed: 0,Name,Maths,Science
0,Martha,87,83
1,Tim,91,99
2,Rob,97,84
3,Georgia,95,76
4,Amy,89,93
5,Maddy,90,81


## Df.groupby()

Syntax :    DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=<object object>, observed=False, **kwargs)
    
* by: This is the criteria for grouping. It could be a column name, a list of column names, or a function.
* axis: The axis along which the groupings are performed. By default, it is set to 0, which means grouping along rows.
* level: If the axis is a MultiIndex (hierarchical), the level parameter indicates the level to group by.
* as_index: If True (default), the group labels will be the indices of the resulting DataFrame. If False, the group labels will be included as a separate column.
* sort: Sort the group labels.
* group_keys: When calling apply, add group keys to index to identify pieces.
* squeeze: Reduce the dimensionality of the return type if possible, otherwise return a consistent type.
* observed: This is used when the groupers contain only unique values. If True, only the observed values will be used.
* **kwargs: Additional keyword arguments to be passed to the groupby function.


In [16]:
# importing pandas module
import pandas as pd 

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA']} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1    Anuj   24     Kanpur            MA
2     Jai   22  Allahabad           MCA
3  Princi   32    Kannuaj           Phd
4  Gaurav   33    Jaunpur        B.Tech
5    Anuj   36     Kanpur         B.com
6  Princi   27  Allahabad           Msc
7    Abhi   32    Aligarh            MA


In [18]:
df.groupby('Name')
print(df.groupby('Name').groups)


{'Abhi': [7], 'Anuj': [1, 5], 'Gaurav': [4], 'Jai': [0, 2], 'Princi': [3, 6]}


In [20]:
gk=df.groupby('Name')
gk.first()

Unnamed: 0_level_0,Age,Address,Qualification
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,Aligarh,MA
Anuj,24,Kanpur,MA
Gaurav,33,Jaunpur,B.Tech
Jai,27,Nagpur,Msc
Princi,32,Kannuaj,Phd


In [21]:
# importing pandas module
import pandas as pd 

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA']} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1    Anuj   24     Kanpur            MA
2     Jai   22  Allahabad           MCA
3  Princi   32    Kannuaj           Phd
4  Gaurav   33    Jaunpur        B.Tech
5    Anuj   36     Kanpur         B.com
6  Princi   27  Allahabad           Msc
7    Abhi   32    Aligarh            MA


In [24]:
print(df.groupby(['Name','Qualification']).groups)

{('Abhi', 'MA'): [7], ('Anuj', 'B.com'): [5], ('Anuj', 'MA'): [1], ('Gaurav', 'B.Tech'): [4], ('Jai', 'MCA'): [2], ('Jai', 'Msc'): [0], ('Princi', 'Msc'): [6], ('Princi', 'Phd'): [3]}


In [26]:
print(df.groupby('Name').sum())

        Age           Address Qualification
Name                                       
Abhi     32           Aligarh            MA
Anuj     60      KanpurKanpur       MAB.com
Gaurav   33           Jaunpur        B.Tech
Jai      49   NagpurAllahabad        MscMCA
Princi   59  KannuajAllahabad        PhdMsc


In [29]:
print(df.groupby('Name',sort=False).sum())

        Age           Address Qualification
Name                                       
Jai      49   NagpurAllahabad        MscMCA
Anuj     60      KanpurKanpur       MAB.com
Princi   59  KannuajAllahabad        PhdMsc
Gaurav   33           Jaunpur        B.Tech
Abhi     32           Aligarh            MA


In [33]:
grp=df.groupby(['Name','Qualification'])

for name, group in grp:
    print(name)
    print(group)
    print()

('Abhi', 'MA')
   Name  Age  Address Qualification
7  Abhi   32  Aligarh            MA

('Anuj', 'B.com')
   Name  Age Address Qualification
5  Anuj   36  Kanpur         B.com

('Anuj', 'MA')
   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA

('Gaurav', 'B.Tech')
     Name  Age  Address Qualification
4  Gaurav   33  Jaunpur        B.Tech

('Jai', 'MCA')
  Name  Age    Address Qualification
2  Jai   22  Allahabad           MCA

('Jai', 'Msc')
  Name  Age Address Qualification
0  Jai   27  Nagpur           Msc

('Princi', 'Msc')
     Name  Age    Address Qualification
6  Princi   27  Allahabad           Msc

('Princi', 'Phd')
     Name  Age  Address Qualification
3  Princi   32  Kannuaj           Phd



In [35]:
grp=df.groupby('Name')

for name, group in grp:
    print(name)
    print(group)
    print()

Abhi
   Name  Age  Address Qualification
7  Abhi   32  Aligarh            MA

Anuj
   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA
5  Anuj   36  Kanpur         B.com

Gaurav
     Name  Age  Address Qualification
4  Gaurav   33  Jaunpur        B.Tech

Jai
  Name  Age    Address Qualification
0  Jai   27     Nagpur           Msc
2  Jai   22  Allahabad           MCA

Princi
     Name  Age    Address Qualification
3  Princi   32    Kannuaj           Phd
6  Princi   27  Allahabad           Msc



In [36]:
print(df.groupby('Name').get_group('Anuj'))

   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA
5  Anuj   36  Kanpur         B.com


In [40]:
print(df.groupby(['Name','Qualification']).get_group(('Anuj','MA')))

   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA


In [47]:
# importing pandas module
import pandas as pd 

# importing numpy as np
import numpy as np

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA']} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1    Anuj   24     Kanpur            MA
2     Jai   22  Allahabad           MCA
3  Princi   32    Kannuaj           Phd
4  Gaurav   33    Jaunpur        B.Tech
5    Anuj   36     Kanpur         B.com
6  Princi   27  Allahabad           Msc
7    Abhi   32    Aligarh            MA


### df.groupby().Aggregation()

In [51]:
grpfunc1=df.groupby('Name')
grpfunc1.aggregate(np.sum)

Unnamed: 0_level_0,Age,Address,Qualification
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,Aligarh,MA
Anuj,60,KanpurKanpur,MAB.com
Gaurav,33,Jaunpur,B.Tech
Jai,49,NagpurAllahabad,MscMCA
Princi,59,KannuajAllahabad,PhdMsc


In [52]:
grpfunc1=df.groupby(['Name','Qualification'])
grpfunc1.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Address
Name,Qualification,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,MA,32,Aligarh
Anuj,B.com,36,Kanpur
Anuj,MA,24,Kanpur
Gaurav,B.Tech,33,Jaunpur
Jai,MCA,22,Allahabad
Jai,Msc,27,Nagpur
Princi,Msc,27,Allahabad
Princi,Phd,32,Kannuaj


In [56]:
grpfunc1=df.groupby('Name')
grpfunc1['Age'].aggregate([np.sum,np.std,np.sum])

Unnamed: 0_level_0,sum,std,sum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,,32
Anuj,60,8.485281,60
Gaurav,33,,33
Jai,49,3.535534,49
Princi,59,3.535534,59


In [57]:
# importing pandas module
import pandas as pd 

# importing numpy as np
import numpy as np

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA'],
		'Score': [23, 34, 35, 45, 47, 50, 52, 53]} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification  Score
0     Jai   27     Nagpur           Msc     23
1    Anuj   24     Kanpur            MA     34
2     Jai   22  Allahabad           MCA     35
3  Princi   32    Kannuaj           Phd     45
4  Gaurav   33    Jaunpur        B.Tech     47
5    Anuj   36     Kanpur         B.com     50
6  Princi   27  Allahabad           Msc     52
7    Abhi   32    Aligarh            MA     53


In [58]:
grpfunc1=df.groupby('Name')
grpfunc1.aggregate({'Age':'sum','Score':'std' })

Unnamed: 0_level_0,Age,Score
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abhi,32,
Anuj,60,11.313708
Gaurav,33,
Jai,49,8.485281
Princi,59,4.949747


## df.groupby().tranformation()

In [64]:
# importing pandas module
import pandas as pd 

# importing numpy as np
import numpy as np

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA'],
		'Score': [23, 34, 35, 45, 47, 50, 52, 53]} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification  Score
0     Jai   27     Nagpur           Msc     23
1    Anuj   24     Kanpur            MA     34
2     Jai   22  Allahabad           MCA     35
3  Princi   32    Kannuaj           Phd     45
4  Gaurav   33    Jaunpur        B.Tech     47
5    Anuj   36     Kanpur         B.com     50
6  Princi   27  Allahabad           Msc     52
7    Abhi   32    Aligarh            MA     53


In [63]:
grptransform=df.groupby('Name')
sc=lambda x: (x-x.mean())/x.std()*10 if x.dtypes in ['int64','float64'] else x
print(grptransform.transform(sc))

        Age    Address Qualification     Score
0  7.071068     Nagpur           Msc -7.071068
1 -7.071068     Kanpur            MA -7.071068
2 -7.071068  Allahabad           MCA  7.071068
3  7.071068    Kannuaj           Phd -7.071068
4       NaN    Jaunpur        B.Tech       NaN
5  7.071068     Kanpur         B.com  7.071068
6 -7.071068  Allahabad           Msc  7.071068
7       NaN    Aligarh            MA       NaN


In [71]:
grptransform=df.groupby('Name')
sc=lambda x: (x-x.mean())/x.std()*10 
print(grptransform[['Age','Score']].transform(sc))

        Age     Score
0  7.071068 -7.071068
1 -7.071068 -7.071068
2 -7.071068  7.071068
3  7.071068 -7.071068
4       NaN       NaN
5  7.071068  7.071068
6 -7.071068  7.071068
7       NaN       NaN


## df.groupby().filteration()

In [72]:
# importing pandas module
import pandas as pd 

# importing numpy as np
import numpy as np

# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Anuj', 'Jai', 'Princi', 
				'Gaurav', 'Anuj', 'Princi', 'Abhi'], 
		'Age':[27, 24, 22, 32, 
			33, 36, 27, 32], 
		'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'], 
		'Qualification':['Msc', 'MA', 'MCA', 'Phd',
						'B.Tech', 'B.com', 'Msc', 'MA'],
		'Score': [23, 34, 35, 45, 47, 50, 52, 53]} 
	

# Convert the dictionary into DataFrame 
df = pd.DataFrame(data1)

print(df) 


     Name  Age    Address Qualification  Score
0     Jai   27     Nagpur           Msc     23
1    Anuj   24     Kanpur            MA     34
2     Jai   22  Allahabad           MCA     35
3  Princi   32    Kannuaj           Phd     45
4  Gaurav   33    Jaunpur        B.Tech     47
5    Anuj   36     Kanpur         B.com     50
6  Princi   27  Allahabad           Msc     52
7    Abhi   32    Aligarh            MA     53


In [73]:
df.groupby('Name').groups

{'Abhi': [7], 'Anuj': [1, 5], 'Gaurav': [4], 'Jai': [0, 2], 'Princi': [3, 6]}

In [78]:
grpfilter=df.groupby('Name')
func=lambda x: len(x)<2
grpfilter.filter(func)

Unnamed: 0,Name,Age,Address,Qualification,Score
4,Gaurav,33,Jaunpur,B.Tech,47
7,Abhi,32,Aligarh,MA,53


### group in rows

In [79]:
# importing Pandas 
import pandas as pd 

# example dataframe 
example = {'Team':['Arsenal', 'Manchester United', 'Arsenal', 
				'Arsenal', 'Chelsea', 'Manchester United', 
				'Manchester United', 'Chelsea', 'Chelsea', 'Chelsea'], 
					
		'Player':['Ozil', 'Pogba', 'Lucas', 'Aubameyang', 
					'Hazard', 'Mata', 'Lukaku', 'Morata', 
										'Giroud', 'Kante'], 
										
		'Goals':[5, 3, 6, 4, 9, 2, 0, 5, 2, 3] } 

df = pd.DataFrame(example) 

print(df) 


                Team      Player  Goals
0            Arsenal        Ozil      5
1  Manchester United       Pogba      3
2            Arsenal       Lucas      6
3            Arsenal  Aubameyang      4
4            Chelsea      Hazard      9
5  Manchester United        Mata      2
6  Manchester United      Lukaku      0
7            Chelsea      Morata      5
8            Chelsea      Giroud      2
9            Chelsea       Kante      3


In [85]:
total_goal=df['Goals'].groupby(df['Team'])
total_goal.mean()

Team
Arsenal              5.000000
Chelsea              4.750000
Manchester United    1.666667
Name: Goals, dtype: float64

In [90]:
total=df.groupby('Team').groups
# total_goal.mean()

In [97]:
total=df.groupby('Team')
func=lambda x: x.mean() if x.dtypes in ['int64','float64'] else x
total.aggregate(func)

Unnamed: 0_level_0,Player,Goals
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,"[Ozil, Lucas, Aubameyang]",5.0
Chelsea,"[Hazard, Morata, Giroud, Kante]",4.75
Manchester United,"[Pogba, Mata, Lukaku]",1.666667


In [99]:
import pandas as pd 

# example dataframe 
example = {'Team':['Australia', 'England', 'South Africa', 
				'Australia', 'England', 'India', 'India', 
						'South Africa', 'England', 'India'], 
						
		'Player':['Ricky Ponting', 'Joe Root', 'Hashim Amla', 
					'David Warner', 'Jos Buttler', 'Virat Kohli', 
					'Rohit Sharma', 'David Miller', 'Eoin Morgan', 
												'Dinesh Karthik'], 
												
		'Runs':[345, 336, 689, 490, 989, 672, 560, 455, 342, 376], 
			
		'Salary':[34500, 33600, 68900, 49000, 98899, 
					67562, 56760, 45675, 34542, 31176] } 

df = pd.DataFrame(example) 

df


Unnamed: 0,Team,Player,Runs,Salary
0,Australia,Ricky Ponting,345,34500
1,England,Joe Root,336,33600
2,South Africa,Hashim Amla,689,68900
3,Australia,David Warner,490,49000
4,England,Jos Buttler,989,98899
5,India,Virat Kohli,672,67562
6,India,Rohit Sharma,560,56760
7,South Africa,David Miller,455,45675
8,England,Eoin Morgan,342,34542
9,India,Dinesh Karthik,376,31176


In [100]:
total=df.groupby('Team')

In [102]:
func=lambda x: x.mean() if x.dtypes in ['int64','float64'] else x
total['Salary'].aggregate(func)

Team
Australia       41750.000000
England         55680.333333
India           51832.666667
South Africa    57287.500000
Name: Salary, dtype: float64

In [103]:
# importing pandas as pd 
import pandas as pd 

# Creating a dictionary 
d = {'id':['1', '2', '3'], 
	'Column 1.1':[14, 15, 16], 
	'Column 1.2':[10, 10, 10], 
	'Column 1.3':[1, 4, 5], 
	'Column 2.1':[1, 2, 3], 
	'Column 2.2':[10, 10, 10], } 

# Converting dictionary into a data-frame 
df = pd.DataFrame(d) 
print(df) 


  id  Column 1.1  Column 1.2  Column 1.3  Column 2.1  Column 2.2
0  1          14          10           1           1          10
1  2          15          10           4           2          10
2  3          16          10           5           3          10


In [111]:
groupby_dict = {'Column 1.1':'Column 1', 
                'Column 1.2':'Column 1', 
                'Column 1.3':'Column 1', 
                'Column 2.1':'Column 2', 
                'Column 2.2':'Column 2' } 
  
df.set_index('id')
df.groupby(by=groupby_dict,axis=1).min()

Unnamed: 0,Column 1,Column 2
0,1,1
1,4,2
2,5,3


In [2]:
# importing pandas as pd 
import pandas as pd 

# Create dictionary with data 
dict = { 
	"ID":[1, 2, 3], 
	"Movies":["The Godfather", "Fight Club", "Casablanca"], 
	"Week_1_Viewers":[30, 30, 40], 
	"Week_2_Viewers":[60, 40, 80], 
	"Week_3_Viewers":[40, 20, 20] }; 

# Convert dictionary to dataframe 
df = pd.DataFrame(dict); 
print(df) 


   ID         Movies  Week_1_Viewers  Week_2_Viewers  Week_3_Viewers
0   1  The Godfather              30              60              40
1   2     Fight Club              30              40              20
2   3     Casablanca              40              80              20


In [5]:
groupby_dict={
    'Week_1_Viewers':'Total',
    'Week_2_Viewers':'Total'
}

df.groupby(groupby_dict,axis=1).sum()


Unnamed: 0,Total
0,90
1,70
2,120


## pd.concat()
### pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True)


* objs: This is a sequence or mapping of Series, DataFrame, or Panel objects to be concatenated. It can be a list, tuple, or dictionary.

* axis: The axis along which the objects will be concatenated. By default, it's 0 (concatenating along rows). If axis=1, concatenation will be along columns.

* join: Specifies how to handle indexes on other axes. It can be 'outer' (default) or 'inner'. 'outer' performs a union, 'inner' performs an intersection.

* ignore_index: If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, 1, …, n - 1.

* keys: This is used to create a hierarchical index on the concatenation axis. It's a sequence of labels to use to construct a MultiIndex.

* levels: Specific levels (unique values) to use for constructing a MultiIndex. Otherwise, they will be inferred from the keys.

* names: Names for the levels in the resulting hierarchical index.

* verify_integrity: If True, check whether the new concatenated axis contains duplicates. If duplicates are found, raise a ValueError.

* Sort: Sort the resulting DataFrame by the index along the concatenation axis. The default is False.

* copy: If False, do not copy data unnecessarily.

In [6]:
# importing pandas module
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']} 
   
# Define a dictionary containing employee data 
data2 = {'Name':['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'], 
        'Age':[17, 14, 12, 52], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=[4, 5, 6, 7])
 
print(df, "\n\n", df1) 

     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannuaj           Phd 

      Name  Age    Address Qualification
4    Abhi   17     Nagpur         Btech
5  Ayushi   14     Kanpur           B.A
6  Dhiraj   12  Allahabad          Bcom
7  Hitesh   52    Kannuaj        B.hons


In [7]:
df=pd.concat([df,df1])

In [8]:
df

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Nagpur,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannuaj,Phd
4,Abhi,17,Nagpur,Btech
5,Ayushi,14,Kanpur,B.A
6,Dhiraj,12,Allahabad,Bcom
7,Hitesh,52,Kannuaj,B.hons


In [38]:
# importing pandas module
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Msc', 'MA', 'MCA', 'Phd'],
        'Mobile No': [97, 91, 58, 76]} 
   
# Define a dictionary containing employee data 
data2 = {'Name':['Gaurav', 'Anuj', 'Dhiraj', 'Hitesh'], 
        'Age':[22, 32, 12, 52], 
        'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons'],
        'Salary':[1000, 2000, 3000, 4000]} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=[2, 3, 6, 7]) 
 
print(df, "\n\n", df1) 

     Name  Age    Address Qualification  Mobile No
0     Jai   27     Nagpur           Msc         97
1  Princi   24     Kanpur            MA         91
2  Gaurav   22  Allahabad           MCA         58
3    Anuj   32    Kannuaj           Phd         76 

      Name  Age    Address Qualification  Salary
2  Gaurav   22  Allahabad           MCA    1000
3    Anuj   32    Kannuaj           Phd    2000
6  Dhiraj   12  Allahabad          Bcom    3000
7  Hitesh   52    Kannuaj        B.hons    4000


In [28]:
df=pd.concat([df,df1],axis=1,join='inner',sort=False)
df

Unnamed: 0,Name,Age,Address,Qualification,Mobile No,Name.1,Age.1,Address.1,Qualification.1,Salary,...,Name.2,Age.2,Address.2,Qualification.2,Salary.1,Name.3,Age.3,Address.3,Qualification.3,Salary.2
2,Gaurav,22,Allahabad,MCA,58,Gaurav,22,Allahabad,MCA,1000,...,Gaurav,22,Allahabad,MCA,1000,Gaurav,22,Allahabad,MCA,1000
3,Anuj,32,Kannuaj,Phd,76,Anuj,32,Kannuaj,Phd,2000,...,Anuj,32,Kannuaj,Phd,2000,Anuj,32,Kannuaj,Phd,2000


In [34]:
df=pd.concat([df,df1],ignore_index=True)
df

Unnamed: 0,Name,Age,Address,Qualification,Mobile No,Salary
0,Jai,27,Nagpur,Msc,97.0,
1,Princi,24,Kanpur,MA,91.0,
2,Gaurav,22,Allahabad,MCA,58.0,
3,Anuj,32,Kannuaj,Phd,76.0,
4,Gaurav,22,Allahabad,MCA,,1000.0
5,Anuj,32,Kannuaj,Phd,,2000.0
6,Dhiraj,12,Allahabad,Bcom,,3000.0
7,Hitesh,52,Kannuaj,B.hons,,4000.0


In [39]:
df=pd.concat([df,df1],ignore_index=False, keys=['X','Y'])
df

Unnamed: 0,Unnamed: 1,Name,Age,Address,Qualification,Mobile No,Salary
X,0,Jai,27,Nagpur,Msc,97.0,
X,1,Princi,24,Kanpur,MA,91.0,
X,2,Gaurav,22,Allahabad,MCA,58.0,
X,3,Anuj,32,Kannuaj,Phd,76.0,
Y,2,Gaurav,22,Allahabad,MCA,,1000.0
Y,3,Anuj,32,Kannuaj,Phd,,2000.0
Y,6,Dhiraj,12,Allahabad,Bcom,,3000.0
Y,7,Hitesh,52,Kannuaj,B.hons,,4000.0


In [49]:
# importing pandas module
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']} 
   
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
 
# creating a series
s1 = pd.Series([1000, 2000, 3000, 4000],name='Salary')
 
print(df, "\n\n", s1) 

     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannuaj           Phd 

 0    1000
1    2000
2    3000
3    4000
Name: Salary, dtype: int64


In [50]:
df=pd.concat([df,s1],axis=1)

In [51]:
df

Unnamed: 0,Name,Age,Address,Qualification,Salary
0,Jai,27,Nagpur,Msc,1000
1,Princi,24,Kanpur,MA,2000
2,Gaurav,22,Allahabad,MCA,3000
3,Anuj,32,Kannuaj,Phd,4000


## pd.merge()

### pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

* left: The left DataFrame to be merged.

* right: The right DataFrame to be merged.

* how: The type of merge to be performed. It can be 'left', 'right', 'outer', or 'inner'. The default is 'inner'.

* on: The column or index level names to join on. This must be found in both DataFrames. If not specified and left_index and right_index are False, the intersection of the columns in both DataFrames will be used.

* left_on: Columns or index levels from the left DataFrame to use as keys. Can also be the column names or lists of column names.

* right_on: Columns or index levels from the right DataFrame to use as keys. Can also be the column names or lists of column names.

* left_index: If True, use the index (row labels) from the left DataFrame as its join key(s).

* right_index: If True, use the index (row labels) from the right DataFrame as its join key(s).

* sort: Sort the result DataFrame by the join keys in lexicographical order. Default is True.

* suffixes: A tuple of string suffixes to apply to overlapping column names. Default is ('_x', '_y').

* copy: If False, avoid copying data unnecessarily. Default is True.

* indicator: If True, adds a special column '_merge' to the output DataFrame with information on the source of each row.

* validate: If specified, checks if merge is of specified type. Options are 'one_to_one', 'one_to_many', 'many_to_one', or 'many_to_many'.


In [1]:
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32],} 
   
# Define a dictionary containing employee data 
data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1)
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2) 
  
 
print(df, "\n\n", df1) 

  key    Name  Age
0  K0     Jai   27
1  K1  Princi   24
2  K2  Gaurav   22
3  K3    Anuj   32 

   key    Address Qualification
0  K0     Nagpur         Btech
1  K1     Kanpur           B.A
2  K2  Allahabad          Bcom
3  K3    Kannuaj        B.hons


In [2]:
res=pd.merge(df,df1, on='key')
res

Unnamed: 0,key,Name,Age,Address,Qualification
0,K0,Jai,27,Nagpur,Btech
1,K1,Princi,24,Kanpur,B.A
2,K2,Gaurav,22,Allahabad,Bcom
3,K3,Anuj,32,Kannuaj,B.hons


In [3]:
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K1', 'K0', 'K1'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32],} 
   
# Define a dictionary containing employee data 
data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K0', 'K0', 'K0'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1)
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2) 
  
 
print(df, "\n\n", df1) 

  key key1    Name  Age
0  K0   K0     Jai   27
1  K1   K1  Princi   24
2  K2   K0  Gaurav   22
3  K3   K1    Anuj   32 

   key key1    Address Qualification
0  K0   K0     Nagpur         Btech
1  K1   K0     Kanpur           B.A
2  K2   K0  Allahabad          Bcom
3  K3   K0    Kannuaj        B.hons


In [4]:
res1 = pd.merge(df, df1, on=['key', 'key1'])
 
res1

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27,Nagpur,Btech
1,K2,K0,Gaurav,22,Allahabad,Bcom


In [5]:
import pandas as pd 
 
# Define a dictionary containing employee data 
data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K1', 'K0', 'K1'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32],} 
   
# Define a dictionary containing employee data 
data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K0', 'K0', 'K0'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']} 
 
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1)
 
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2) 
  
 
print(df, "\n\n", df1) 

  key key1    Name  Age
0  K0   K0     Jai   27
1  K1   K1  Princi   24
2  K2   K0  Gaurav   22
3  K3   K1    Anuj   32 

   key key1    Address Qualification
0  K0   K0     Nagpur         Btech
1  K1   K0     Kanpur           B.A
2  K2   K0  Allahabad          Bcom
3  K3   K0    Kannuaj        B.hons


In [6]:
res=pd.merge(df,df1,how='left',on=['key','key1'])
res

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27,Nagpur,Btech
1,K1,K1,Princi,24,,
2,K2,K0,Gaurav,22,Allahabad,Bcom
3,K3,K1,Anuj,32,,


In [7]:
res=pd.merge(df,df1,how='right',on=['key','key1'])
res

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27.0,Nagpur,Btech
1,K1,K0,,,Kanpur,B.A
2,K2,K0,Gaurav,22.0,Allahabad,Bcom
3,K3,K0,,,Kannuaj,B.hons


In [8]:
res=pd.merge(df,df1,how='outer',on=['key','key1'])
res

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27.0,Nagpur,Btech
1,K1,K1,Princi,24.0,,
2,K2,K0,Gaurav,22.0,Allahabad,Bcom
3,K3,K1,Anuj,32.0,,
4,K1,K0,,,Kanpur,B.A
5,K3,K0,,,Kannuaj,B.hons


In [9]:
res=pd.merge(df,df1,how='inner',on=['key','key1'])
res

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27,Nagpur,Btech
1,K2,K0,Gaurav,22,Allahabad,Bcom


## df.join()

### DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)

* other: The DataFrame or Series to be joined.

* on: Column or index level names to join on. Must be found in both DataFrames. If not specified, the join will be based on the index.

* how: Type of join to be performed. It can be 'left', 'right', 'outer', or 'inner'. Default is 'left'.

* lsuffix: Suffix to be added to the column names of the left DataFrame. Default is an empty string.

* rsuffix: Suffix to be added to the column names of the right DataFrame. Default is an empty string.

* sort: Sort the resulting DataFrame by the join keys in lexicographical order. Default is False.

In [10]:
import pandas as pd 
  
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32]} 
    
# Define a dictionary containing employee data 
data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']} 
  
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=['K0', 'K1', 'K2', 'K3'])
  
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=['K0', 'K2', 'K3', 'K4'])
 
 
print(df, "\n\n", df1)  

      Name  Age
K0     Jai   27
K1  Princi   24
K2  Gaurav   22
K3    Anuj   32 

       Address Qualification
K0  Allahabad           MCA
K2    Kannuaj           Phd
K3  Allahabad          Bcom
K4    Kannuaj        B.hons


In [12]:
res=df.join(df1)
res

Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27,Allahabad,MCA
K1,Princi,24,,
K2,Gaurav,22,Kannuaj,Phd
K3,Anuj,32,Allahabad,Bcom


In [13]:
res=df.join(df1, how='outer')
res

Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27.0,Allahabad,MCA
K1,Princi,24.0,,
K2,Gaurav,22.0,Kannuaj,Phd
K3,Anuj,32.0,Allahabad,Bcom
K4,,,Kannuaj,B.hons


In [18]:
import pandas as pd 
  
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32],
        'Key':['K0', 'K1', 'K2', 'K3']} 
    
# Define a dictionary containing employee data 
data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']} 
  
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1)
  
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index=['K0', 'K2', 'K3', 'K4'])
 
 
print(df, "\n\n", df1) 
res=df.join(df1,on='Key')
res

     Name  Age Key
0     Jai   27  K0
1  Princi   24  K1
2  Gaurav   22  K2
3    Anuj   32  K3 

       Address Qualification
K0  Allahabad           MCA
K2    Kannuaj           Phd
K3  Allahabad          Bcom
K4    Kannuaj        B.hons


Unnamed: 0,Name,Age,Key,Address,Qualification
0,Jai,27,K0,Allahabad,MCA
1,Princi,24,K1,,
2,Gaurav,22,K2,Kannuaj,Phd
3,Anuj,32,K3,Allahabad,Bcom


In [24]:
# importing pandas module
import pandas as pd 
  
# Define a dictionary containing employee data 
data1 = {'Name':['Jai', 'Princi', 'Gaurav'], 
        'Age':[27, 24, 22]} 
    
# Define a dictionary containing employee data 
data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kanpur'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']} 
  
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1, index=pd.Index(['K0', 'K1', 'K2'], name='key'))
 
index = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),('K2', 'Y2'), ('K2', 'Y3')],names=['key', 'Y'])
  
# Convert the dictionary into DataFrame  
df1 = pd.DataFrame(data2, index= index)
 
 
print(df, "\n\n", df1)

       Name  Age
key             
K0      Jai   27
K1   Princi   24
K2   Gaurav   22 

           Address Qualification
key Y                          
K0  Y0  Allahabad           MCA
K1  Y1    Kannuaj           Phd
K2  Y2  Allahabad          Bcom
    Y3     Kanpur        B.hons


In [25]:
# joining singly indexed with
# multi indexed
result = df.join(df1, how='inner')
 
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Age,Address,Qualification
key,Y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
K0,Y0,Jai,27,Allahabad,MCA
K1,Y1,Princi,24,Kannuaj,Phd
K2,Y2,Gaurav,22,Allahabad,Bcom
K2,Y3,Gaurav,22,Kanpur,B.hons
