In [2]:
import numpy as np
import pandas as pd

# 1. Data Structure
## 1.1 <b> Pandas Series </b>
- A 1-dimensional labeled array
- Supports many data types
- Axis labels $\rightarrow$ index (get and set values by index label)
- Valid argument to most NumPy
methods

In [2]:
# Series built from a plain list. No index is given, so pandas assigns the
# default integer labels 0..7; the single float literal (3.) makes the whole
# Series float64.
data0 = pd.Series([1, 2, 3., 4, 5, 6, 7, 8])
data0

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
7    8.0
dtype: float64

In [3]:
# Series with explicit string labels. The label 'a' is used twice (first and
# last entry): index labels of a Series do not have to be unique.
# Variant with unique labels only:
#   data1 = pd.Series(data=[1, 2, 3., 4, 5, 6, 7, 8], index=list('abcdefgh'))
data1 = pd.Series(
    data=[1, 2, 3., 4, 5, 6, 7, 8, 9],
    index=list('abcdefgha'),
)
data1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    6.0
g    7.0
h    8.0
a    9.0
dtype: float64

In [4]:
 data1.index          # the Index object holding the labels (note the repeated 'a')
# Other expressions to try, one at a time (uncomment a single line):
# data1[4]            # integer key on a string-labelled Series; prefer data1.iloc[4] for positions
# data1[[2,4,5]]      # list of integer keys; prefer data1.iloc[[2,4,5]] for positions
# data1['a']          # label lookup; 'a' occurs twice, so both matching entries are returned
# 'a' in data1        # membership test against the index labels, not the values
# data1*3             # arithmetic is applied element-wise
# data1**3
# data1/2
# data1

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'a'], dtype='object')

##  1.2. <b>DataFrame </b>
- A 2-dimensional labeled data
structure
- A dictionary of Series objects
    - Columns can be of potentially
different types
- Optional parameters for fine-tuning:
    - index (row labels)
    - columns (column labels)

Pandas provides many constructors to create DataFrames!

In [5]:
# DataFrame from a dict of Series: each key becomes a column and the rows are
# aligned on the union of the two indexes; a subject missing from one Series
# gets NaN in that column.
data2 = pd.DataFrame({
    'GradeS1': pd.Series(
        [70, 50, 67, 84, 98],
        index=['Maths', 'Art', 'Sport', 'Biology', 'Physics'],
    ),
    'GradeS2': pd.Series(
        [90, 70, 88, 78, 76, 77, 87],
        index=['Maths', 'Biology', 'Physics', 'chemistry', 'Economics', 'Finance', 'Geology'],
    ),
})
data2

Unnamed: 0,GradeS1,GradeS2
Art,50.0,
Biology,84.0,70.0
Economics,,76.0
Finance,,77.0
Geology,,87.0
Maths,70.0,90.0
Physics,98.0,88.0
Sport,67.0,
chemistry,,78.0


In [6]:
data2.index      # row labels; evaluated but not shown (only a cell's last expression is rendered)
data2.columns    # column labels; this is the value displayed below the cell

Index(['GradeS1', 'GradeS2'], dtype='object')

In [7]:
# Build another DataFrame from data2, keeping only the listed row labels
# (rows come out in the order given here).
d = pd.DataFrame(
    data2,
    index=['Maths', 'Physics', 'Economics'],
)
d

Unnamed: 0,GradeS1,GradeS2
Maths,70.0,90.0
Physics,98.0,88.0
Economics,,76.0


In [8]:
# Select rows and columns at the same time. Labels that do not exist in data2
# (row 'elvira', column 'Grade12') are still created and filled with NaN.
d = pd.DataFrame(
    data2,
    index=['Maths', 'Physics', 'Economics', 'elvira'],
    columns=('Grade12', 'GradeS2', 'GradeS1'),
)
d

Unnamed: 0,Grade12,GradeS2,GradeS1
Maths,,90.0,70.0
Physics,,88.0,98.0
Economics,,76.0,
elvira,,,


# 2. Reading and Writing Data

## 2.1 <b> pandas: Data Ingestion (Reading) </b>
- Functions such as <font color = red> 'read_excel', 'read_csv', 'read_json', 'read_html', 'read_sql_query', 'read_sql_table' </font>, etc.

<b> Syntax: pandas.function(file_directory) </b>

In [9]:
# Read an Excel workbook; the path is relative, so the file has to sit in the
# current working directory (otherwise pandas raises FileNotFoundError).
ddat = pd.read_excel('Climate_Dynamics.xlsx')
# Only the last expression of a cell is rendered: of the four views below,
# just ddat.tail() appears in the output. Run them one at a time to compare.
ddat
ddat.head()      # first rows (5 by default)
ddat.head(3)     # first 3 rows
ddat.tail()      # last rows (5 by default)

FileNotFoundError: [Errno 2] No such file or directory: 'Climate_Dynamics.xlsx'

In [None]:
# Open the workbook once and read two of its sheets by name.
# The `with` block closes the underlying file handle as soon as both sheets
# are loaded (a bare pd.ExcelFile(...) keeps the file open).
# NOTE: the sheet names really do end with a space.
with pd.ExcelFile('DefenseSchedule2020forTutors.xlsx') as ddatt:
    sheet1_df = pd.read_excel(ddatt, sheet_name='For Examiners ')
    sheet2_df = pd.read_excel(ddatt, sheet_name='For Tutors ')
sheet1_df.head(2)
# sheet2_df.head(3)

In [None]:
# JSON = JavaScript Object Notation. read_json is given a URL here, so this
# cell needs network access.
df = pd.read_json("https://data.smcgov.org/resource/mb6a-xn89.json")
df.head()

In [None]:
# The UCI breast-cancer-wisconsin.data file has no header row: with the default
# header=0 the first observation would be swallowed as column names. Pass
# header=None together with explicit names (taken from the dataset's
# description file), and treat '?' as missing since that is how the file
# marks unknown values.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    header=None,
    names=['sample_code_number', 'clump_thickness', 'uniformity_cell_size',
           'uniformity_cell_shape', 'marginal_adhesion', 'single_epithelial_cell_size',
           'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'],
    na_values='?',
)
data.head()

In [None]:
# auto-mpg.data is whitespace-delimited and has no header row, so the default
# comma parsing would put every record into one garbled column and use the
# first record as the header. Split on runs of whitespace, name the columns
# explicitly (names from the dataset's description file) and treat '?' as
# missing. The quoted car name keeps its inner spaces.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    sep=r'\s+',
    header=None,
    names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model_year', 'origin', 'car_name'],
    na_values='?',
)
data.head()

## 2.2. Data writing

Save the dataframe in the current directory

In [None]:
# Re-create the grades DataFrame (rows = union of the subjects of both Series)
data2 = pd.DataFrame({
    'GradeS1': pd.Series(
        [70, 50, 67, 84, 98],
        index=['Maths', 'Art', 'Sport', 'Biology', 'Physics'],
    ),
    'GradeS2': pd.Series(
        [90, 70, 88, 78, 76, 77, 87],
        index=['Maths', 'Biology', 'Physics', 'chemistry', 'Economics', 'Finance', 'Geology'],
    ),
})
#==========================================
# Writers to try (uncomment one). index=True is the default and writes the row
# labels as the first column; index=False leaves them out.
#data2.to_csv('data_new.csv', index=True)
#data2.to_csv('data_new.txt', sep='\t', index=True)
# data2.to_excel('data_new.xlsx',sheet_name='Sheet1', index = False)
data2.head()

<b>Create a folder in the current directory and save the dataframe inside</b>

In [None]:
#==============================================
# Create the output folder. exist_ok=True makes the call a no-op when the
# folder is already there, which replaces the separate "exists?" check and the
# small race window between checking and creating.
import os
Name_of_folder = 'dbasc'
os.makedirs(Name_of_folder, exist_ok=True)
#==============================================
# index=False does not write the index values (the default is True).
# NOTE: for data2 the index holds the subject names, so they are left out of
# this file; pass index=True to keep them.
# os.path.join builds the path with the separator of the current platform.
data2.to_csv(os.path.join(Name_of_folder, 'data_new.csv'), index=False)
# data2.to_csv(os.path.join(Name_of_folder, 'mtcars_new.txt'), sep='\t', index=False)
# data2.to_excel(os.path.join(Name_of_folder, 'mtcars_new.xlsx'), sheet_name='Sheet1', index=False)

In [None]:
# Show again the labelled Series from section 1.1 (nine values, label 'a' used twice)
data1

# 3. Basic Statistics Summary
<b> pandas in Descriptive Statistics </b>
Functions such as 'describe()', 'corr()', 'min()', 'max()', 'mode()', 'median()', 'std()', etc.

<b> Syntax : data_frame.function() </b>

In [None]:
# Summary statistics of the Series: count, mean, std, min, quartiles and max
data1.describe()

In [None]:
# The grades DataFrame that the statistics below are computed on
data2

In [None]:
# Summary statistics computed separately for each column; NaN entries are left out
data2.describe()

In [None]:
# Take the single row labelled 'Maths' (all columns) as a Series and summarise it
data2.loc['Maths',:].describe()

In [None]:
# Pairwise correlation between the columns (Pearson by default)
data2.corr()

In [None]:
# Largest value of each column
data2.max()

In [None]:
# Pairwise covariance between the columns
data2.cov()

In [None]:
# Standard deviation of each column
data2.std()

In [None]:
# Mean of each column
data2.mean()

In [None]:
# First rows of df. NOTE(review): when the notebook is run top to bottom, df at
# this point is presumably the frame loaded with read_json in section 2.1 - confirm.
df.head()

# 4. Viewing Data

In [3]:
# Small demo frame: two integer columns, one string column, rows labelled 'a'..'j'
dic = {
    'Col1': [1, -2, 3, 1, 5, 6, 1, -3, 1, 1],
    'Col2': [4, -3, 7, 8, 9, 5, 6, 4, -2, 2],
    'Col3': ['Euler', 'A', 'A', 'Euler', 'A', 'A', 'RK4', 'A', 'Euler', 'RK4'],
}
df = pd.DataFrame(dic, index=list('abcdefghij'))
df

Unnamed: 0,Col1,Col2,Col3
a,1,4,Euler
b,-2,-3,A
c,3,7,A
d,1,8,Euler
e,5,9,A
f,6,5,A
g,1,6,RK4
h,-3,4,A
i,1,-2,Euler
j,1,2,RK4


## 4.1 The characteristics of a dataframe

In [None]:
# Only the last line (df.shape) is rendered; run the others one at a time to see them.
df.columns       # column names
df.index         # row labels
df.values        # the data as a NumPy array
df.dtypes        # data type of each column
df.shape         # (number of rows, number of columns)



## 4.2 Access, slice and update the values of a dataframe
To access, slice or update the value of a dataframe one can use the methods <b>loc or iloc</b>.

<b>loc</b> gets rows (or columns) with particular labels from the index. <b>iloc</b> gets rows (or columns) at particular positions in the index (so it only takes integers). 
### 4.2.1 Selecting pandas data using “iloc”

The <b>iloc</b> indexer for Pandas Dataframe is used for <b>integer-location based indexing / selection by position</b>.

The iloc indexer syntax is 
$$\text{data.iloc[<row selection>, <column selection>]},$$ 

which is sure to be a source of confusion for R users. <b>iloc</b> in pandas is used to select rows and columns by number, in the order that they appear in the data frame. You can imagine that each row has a row number from 0 to the total number of rows minus one (<b>data.shape[0] - 1</b>) and <b>iloc[...]</b> allows selections based on these numbers. The same applies for columns (ranging from 0 to <b>data.shape[1] - 1</b>).

There are two “arguments” to iloc – a row selector, and a column selector.  For example:

In [None]:
# Download the UK sample contact list (needs network access)
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')

# Add a pseudo-random numeric 'id' column for the indexing examples.
# The seed is fixed so the same ids are produced on every run.
np.random.seed(21)
data['id'] = [np.random.randint(0, 1000) for _ in range(len(data))]

data.head()

In [None]:
# Single selections with iloc (only the last expression of the cell is displayed)
# Rows - an integer selects one row and returns it as a Series:
data.iloc[0] # first row of data frame (Aleshia Tomkiewicz)
data.iloc[1] # second row of data frame (Evan Zigomalas)
data.iloc[-1] # last row of data frame (Mi Richan)
# Columns - ':' keeps every row, the integer picks one column (also a Series):
data.iloc[:,0] # first column of data frame (first_name)
data.iloc[:,1] # second column of data frame (last_name)
data.iloc[:,-1] # last column of data frame (id)

<b>Note that .iloc returns a Pandas Series when a single row or a single column is selected with an integer (for example data.iloc[0] or data.iloc[:,0]), and a Pandas DataFrame when multiple rows or columns are selected. To get a DataFrame for a single row or column, pass a single-valued list (for example data.iloc[:,[-1]]).</b>


In [None]:
# A one-element list keeps the result two-dimensional: last column (id) as a DataFrame
data.iloc[:,[-1]]

Multiple columns and rows can be selected together using the <b>.iloc</b> indexer.

In [None]:
# Multiple row and column selections with iloc (slices exclude the end position;
# only the last expression of the cell is displayed)
data.iloc[0:5] # first five rows of dataframe
data.iloc[:, 0:2] # first two columns of data frame with all rows
data.iloc[[0,3,6,24], [0,5,6]] # 1st, 4th, 7th, 25th row + 1st 6th 7th columns.
data.iloc[0:5, 5:8] # first 5 rows and columns at positions 5, 6, 7, i.e. the 6th, 7th and 8th columns (county -> phone1).


### 4.2.2. Selecting pandas data using “loc”

The Pandas loc indexer can be used with DataFrames for two different use cases:

    - Selecting rows by label/index
    - Selecting rows with a boolean / conditional lookup
The loc indexer is used with the same syntax as iloc: 
$$\text{data.loc[<row selection>, <column selection>]}$$

#### a. Label-based / Index-based indexing using .loc

Selections using the loc method are based on the index of the data frame (if any). Where the index is set on a DataFrame, using <b>df.set_index()</b>, the .loc method directly selects based on index values of any rows. For example, setting the index of our test data frame to the person's “last_name”:

In [4]:
# Reload the contact list so the example starts from a clean frame
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')

# Same reproducible pseudo-random 'id' column as before (fixed seed)
np.random.seed(21)
data['id'] = [np.random.randint(0, 1000) for _ in range(len(data))]

# Use the last names as row labels, so that .loc can select rows by last name
data = data.set_index("last_name")
data.head()

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


Unnamed: 0_level_0,first_name,company_name,address,city,county,postal,phone1,phone2,email,web,id
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Tomkiewicz,Aleshia,Alan D Rosenburg Cpa Pc,14 Taylor St,St. Stephens Ward,Kent,CT2 7PP,01835-703597,01944-369967,atomkiewicz@hotmail.com,http://www.alandrosenburgcpapc.co.uk,969
Zigomalas,Evan,Cap Gemini America,5 Binney St,Abbey Ward,Buckinghamshire,HP11 2AX,01937-864715,01714-737668,evan.zigomalas@gmail.com,http://www.capgeminiamerica.co.uk,207
Andrade,France,"Elliott, John W Esq",8 Moor Place,East Southbourne and Tuckton W,Bournemouth,BH6 3BE,01347-368222,01935-821636,france.andrade@hotmail.com,http://www.elliottjohnwesq.co.uk,824
Mcwalters,Ulysses,"Mcmahan, Ben L",505 Exeter Rd,Hawerby cum Beesby,Lincolnshire,DN36 5RP,01912-771311,01302-601380,ulysses@hotmail.com,http://www.mcmahanbenl.co.uk,772
Veness,Tyisha,Champagne Room,5396 Forth Street,Greets Green and Lyng Ward,West Midlands,B70 9DT,01547-429341,01290-367248,tyisha.veness@hotmail.com,http://www.champagneroom.co.uk,48


Now with the index set, we can directly select rows for different “last_name” values using <b>.loc[<label>]</b>  – either singly, or in multiples. For example:

In [5]:
#data.loc['Veness']             # a single label: the row comes back as a Series (if the label is unique)
data.loc[['Andrade','Veness']]  # a list of labels: the result is a DataFrame

Unnamed: 0_level_0,first_name,company_name,address,city,county,postal,phone1,phone2,email,web,id
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Andrade,France,"Elliott, John W Esq",8 Moor Place,East Southbourne and Tuckton W,Bournemouth,BH6 3BE,01347-368222,01935-821636,france.andrade@hotmail.com,http://www.elliottjohnwesq.co.uk,824
Veness,Tyisha,Champagne Room,5396 Forth Street,Greets Green and Lyng Ward,West Midlands,B70 9DT,01547-429341,01290-367248,tyisha.veness@hotmail.com,http://www.champagneroom.co.uk,48


<b> Selecting single or multiple rows using .loc index selections with pandas. Note that the first example (a single label) returns a Series, and the second (a list of labels) returns a DataFrame. You can get a single-row DataFrame by passing a single-element list to the .loc operation.</b>

In [None]:
# A one-element list of labels returns a DataFrame rather than a Series
data.loc[['Veness']]

Select columns with <b>.loc</b> using the names of the columns.

In [6]:
# Rows by index label, columns by column name
data.loc[['Andrade','Veness'], ['first_name','address','city']]

Unnamed: 0_level_0,first_name,address,city
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andrade,France,8 Moor Place,East Southbourne and Tuckton W
Veness,Tyisha,5396 Forth Street,Greets Green and Lyng Ward


When using the .loc indexer, columns are referred to by names using lists of strings, or “:” slices.

You can select ranges of index labels – the selection <b>data.loc['Bruch':'Julio']</b> will return all rows in the data frame between the index entries for <b>'Bruch'</b> and <b>'Julio'</b>. The following examples should now make sense:

In [None]:
# Label slices with .loc include BOTH end labels (unlike integer slices with iloc).
# Only the last expression of the cell is displayed.
# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
data.loc[['Andrade', 'Veness'], 'city':'email']
# Select all rows between 'Andrade' and 'Veness', with just 'first_name', 'address' and 'city' columns
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]
# Select all rows between 'Andrade' and 'Veness', with all columns between 'city' and 'email'
data.loc['Andrade':'Veness', 'city':'email']

Note that in the following example, <b>data.loc[207]</b> (the rows whose index label is 207) is not the same as <b>data.iloc[207]</b> (the row at integer position 207, i.e. the 208th row of the data).

In [7]:
# Reload the contact list so the example starts from a clean frame
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')

# Same reproducible pseudo-random 'id' column as before (fixed seed)
np.random.seed(21)
data['id'] = [np.random.randint(0, 1000) for _ in range(len(data))]

# This time the numeric 'id' column becomes the index (its values may repeat)
data = data.set_index('id')
data.head()

Unnamed: 0_level_0,first_name,last_name,company_name,address,city,county,postal,phone1,phone2,email,web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
969,Aleshia,Tomkiewicz,Alan D Rosenburg Cpa Pc,14 Taylor St,St. Stephens Ward,Kent,CT2 7PP,01835-703597,01944-369967,atomkiewicz@hotmail.com,http://www.alandrosenburgcpapc.co.uk
207,Evan,Zigomalas,Cap Gemini America,5 Binney St,Abbey Ward,Buckinghamshire,HP11 2AX,01937-864715,01714-737668,evan.zigomalas@gmail.com,http://www.capgeminiamerica.co.uk
824,France,Andrade,"Elliott, John W Esq",8 Moor Place,East Southbourne and Tuckton W,Bournemouth,BH6 3BE,01347-368222,01935-821636,france.andrade@hotmail.com,http://www.elliottjohnwesq.co.uk
772,Ulysses,Mcwalters,"Mcmahan, Ben L",505 Exeter Rd,Hawerby cum Beesby,Lincolnshire,DN36 5RP,01912-771311,01302-601380,ulysses@hotmail.com,http://www.mcmahanbenl.co.uk
48,Tyisha,Veness,Champagne Room,5396 Forth Street,Greets Green and Lyng Ward,West Midlands,B70 9DT,01547-429341,01290-367248,tyisha.veness@hotmail.com,http://www.champagneroom.co.uk


In [8]:
# Take the index LABEL of the second row (an 'id' value)
i = data.index[1]
print('i = ',i)
# Select every row whose 'id' label equals i; the ids are not unique, so
# several rows can come back
data.loc[[i]]

i =  207


Unnamed: 0_level_0,first_name,last_name,company_name,address,city,county,postal,phone1,phone2,email,web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
207,Evan,Zigomalas,Cap Gemini America,5 Binney St,Abbey Ward,Buckinghamshire,HP11 2AX,01937-864715,01714-737668,evan.zigomalas@gmail.com,http://www.capgeminiamerica.co.uk
207,Eva,Joulwan,Central Hrdwr & Elec Corp,7 Lear Rd,Stroud,Hampshire,GU32 3PQ,01779-720349,01961-802899,eva.joulwan@gmail.com,http://www.centralhrdwreleccorp.co.uk
207,Allene,Burau,Allied Plastics,8 Barlow St #6,Barlow,Derbyshire,S18 7TH,01731-825958,01260-963065,allene.burau@hotmail.com,http://www.alliedplastics.co.uk


In [None]:
# Here i is read as an integer POSITION: this returns the single row located at
# position i, which is unrelated to the rows whose 'id' label equals i
data.iloc[[i]]

#### b. Boolean / Logical indexing using .loc

Conditional selections with boolean arrays using $\text{data.loc[<selection>]}$ is the most common method that I use with Pandas DataFrames. With boolean indexing or logical selection, you pass an array or Series of True/False values to the <b>.loc</b> indexer to select the rows where your Series has True values.

In most use cases, you will make selections based on the values of different columns in your data set.

For example, the statement $$\text{data['first_name'] == 'Antonio'}$$ produces a Pandas Series with a True/False value for every row in the ‘data’ DataFrame, where there are “True” values for the rows where the first_name is “Antonio”. These types of boolean arrays can be passed directly to the <b>.loc</b> indexer like so:

In [None]:
# Reload the contact list so the example starts from a clean frame
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')

# Same reproducible pseudo-random 'id' column as before (fixed seed)
np.random.seed(21)
data['id'] = [np.random.randint(0, 1000) for _ in range(len(data))]

# Index the rows by last name
data = data.set_index("last_name")

# Boolean mask: True on the rows whose first_name is 'Antonio'; .loc keeps those rows
data.loc[data['first_name'].eq('Antonio')]


As before, a second argument can be passed to .loc to select particular columns out of the data frame. Again, columns are referred to by name for the loc indexer and can be a single string, a list of columns, or a slice “:” operation.

In [None]:
# Rows picked by the boolean mask, columns picked by name
data.loc[data['first_name'] == 'Antonio', ['email', 'address','city']]

Note that when selecting columns, if one column only is selected, the .loc operator returns a Series. For a single column DataFrame, use a one-element list to keep the DataFrame format, for example:

In [None]:
data.loc[data['first_name'] == 'Antonio', 'email']    # column given as a string: Series (not displayed, not the last expression)
data.loc[data['first_name'] == 'Antonio', ['email']]  # column given as a one-element list: DataFrame

# 5. Basic Operations

## 5.1 Sorting values in the DataFrame

In [None]:
# Start again from the small demo frame of section 4
dic = {
    'Col1': [1, -2, 3, 1, 5, 6, 1, -3, 1, 1],
    'Col2': [4, -3, 7, 8, 9, 5, 6, 4, -2, 2],
    'Col3': ['Euler', 'A', 'A', 'Euler', 'A', 'A', 'RK4', 'A', 'Euler', 'RK4'],
}
df = pd.DataFrame(dic, index=list('abcdefghij'))

#=============================================================
# Assigning to a column name that does not exist yet creates the column:
# 'X' is five 'A' followed by five 'B', 'Y' alternates 'A', 'B'
df['X'] = list('AAAAABBBBB')
df['Y'] = ['A', 'B'] * 5
#=============================================================
# Assigning to a row label that does not exist yet appends a row;
# one value per column (5 columns at this point)
df.loc['z'] = [1, 2, 3, 4, 5]

df


In [None]:
# Two sorts applied one after the other:
#   1. by 'Col1', ascending
#   2. by 'Col1' then 'Col2', both descending - this fixes the final order
df = (
    df
    .sort_values(by=['Col1'], ascending=[True])
    .sort_values(by=['Col1', 'Col2'], ascending=[False, False])
)
df

## 5.2. Replace NaN Values in Pandas DataFrame

Depending on the scenario, you may use any of the 4 methods below to replace NaN values with a constant (for example zero) in a Pandas DataFrame:

let us consider the following dataframe

In [None]:
# Grades for four semesters. The Series cover different subjects, so after
# alignment on the union of their indexes several cells are NaN.
data2 = pd.DataFrame({
    'GradeS1': pd.Series(
        [70, 50, 67, 84, 98],
        index=['Maths', 'Art', 'Sport', 'Biology', 'Physics'],
    ),
    'GradeS2': pd.Series(
        [90, 70, 88, 78, 76, 77, 87],
        index=['Maths', 'Biology', 'Physics', 'chemistry', 'Economics', 'Finance', 'Geology'],
    ),
    'GradeS3': pd.Series(
        [69, 75, 73, 87],
        index=['Maths', 'chemistry', 'Economics', 'Geology'],
    ),
    'GradeS4': pd.Series(
        [90, 70, 88, 78, 76, 77, 87, 98, 90],
        index=['Maths', 'Biology', 'Physics', 'chemistry', 'Economics', 'Finance', 'Geology', 'Art', 'Sport'],
    ),
})
data2

1. For a single column or entire dataframe using NumPy: 

In [None]:
# Work on a copy so that data2 itself stays untouched
data3 = data2.copy()
# Replace NaN by 30 in the column 'GradeS2' only, using replace() with np.nan.
# The result is assigned back to the column: data3['GradeS2'].replace(..., inplace=True)
# would act on an intermediate object (chained assignment), which recent pandas
# warns about and which does not update data3 under Copy-on-Write.
data3['GradeS2'] = data3['GradeS2'].replace(np.nan, 30)
data3

In [None]:
# Replace NaN by 30 in the entire dataframe, using replace() with np.nan.
# replace() returns a new DataFrame, so data2 itself stays untouched.
data3 = data2.replace(np.nan, 30)
data3

2. For a single column or entire dataframe using Pandas: 

In [None]:
# Work on a copy so that data2 itself stays untouched
data3 = data2.copy()
# Replace NaN by 30 in the column 'GradeS2' only, using the pandas fillna().
# The result is assigned back to the column: data3['GradeS2'].fillna(..., inplace=True)
# would act on an intermediate object (chained assignment), which recent pandas
# warns about and which does not update data3 under Copy-on-Write.
data3['GradeS2'] = data3['GradeS2'].fillna(30)
data3

In [None]:
# Replace NaN by 3 in the entire dataframe, using the pandas fillna().
# fillna() returns a new DataFrame, so data2 itself stays untouched.
data3 = data2.fillna(3)
data3

Replace missing values with the last valid observation (useful in time series
data). For example, temperature does not change drastically compared to the previous observation, so a better approach to filling NA values is to forward fill or backward fill. There are two main methods available:

In [None]:
# Work on a copy so that data2 itself stays untouched
data3 = data2.copy()
# Backward fill: replace each NaN in the column 'GradeS1' by the NEXT valid value.
# bfill() replaces the deprecated fillna(method='bfill'), and the result is
# assigned back to the column instead of using inplace=True on a column
# selection (chained assignment).
data3['GradeS1'] = data3['GradeS1'].bfill()
# Note that a NaN at the end of the column is not replaced, because there is
# no next value to copy from.

# Backward fill in the entire dataframe (each column is filled independently)
data3 = data3.bfill()
data3

In [None]:
# Work on a copy so that data2 itself stays untouched
data3 = data2.copy()
# Forward fill: replace each NaN in the column 'GradeS2' by the PREVIOUS valid value.
# ffill() replaces the deprecated fillna(method='ffill'), and the result is
# assigned back to the column instead of using inplace=True on a column
# selection (chained assignment).
data3['GradeS2'] = data3['GradeS2'].ffill()
# Note that a NaN at the start of the column is not replaced, because there is
# no previous value to copy from.

# Forward fill in the entire dataframe (each column is filled independently)
data3 = data3.ffill()
data3

## 5.3. Delete NaN Values in Pandas DataFrame

In [None]:
# Delete every ROW that contains at least one NaN (axis=0).
# dropna() returns a new DataFrame, so data2 itself stays untouched.
data3 = data2.dropna(axis=0)
data3

In [None]:
# Delete every COLUMN that contains at least one NaN (axis=1).
# dropna() returns a new DataFrame, so data2 itself stays untouched.
data3 = data2.dropna(axis=1)
data3

## 5.4. Grouping
Grouping involves one or more of the following steps:
- Splitting the data into groups based on some criteria,
- Applying a function to each group independently,
- Combining the results into a data structure

Let us consider the following data

In [None]:
# Toy data for grouping: two names alternating over eight rows, a state and a
# grade per row, and a random integer age drawn from 24..49 (50 is excluded).
df = pd.DataFrame({
    'Name': ['jack', 'jane'] * 4,
    'State': ['SFO', 'SFO', 'NYK', 'CA', 'NYK', 'NYK', 'SFO', 'CA'],
    'Grade': ['A', 'A', 'B', 'A', 'C', 'B', 'C', 'A'],
    'Age': np.random.randint(24, 50, size=8),
})
df

Find max age by Name / State

In [None]:
# One group per (Name, State) pair; max() is applied to every remaining column,
# so the result shows the maximum 'Grade' as well as the maximum 'Age'
df.groupby(['Name','State']).max()

# 6. Plot of a dataframe

In [None]:
# The grades DataFrame that is plotted below
data2

In [None]:
import matplotlib.pyplot as plt

# Bar chart of data2 on a 10 x 10 inch figure: one group of bars per row label
# (subject), one bar per column.
data2.plot(kind='bar', figsize=(10, 10))
plt.show()

In [None]:
# IPython-only introspection: the trailing `?` shows the documentation of data2.plot.
# This is not plain-Python syntax, so the cell only runs inside IPython/Jupyter.
data2.plot?