# Operations

There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show them here in this lecture:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tennis data set from 'tennis.csv' (path relative to the working directory).
data = pd.read_csv('tennis.csv')
# A bare last expression makes the notebook render the DataFrame.
data

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


## Inspecting Data set

### data.info()

Prints a summary of the DataFrame: the index dtype, the column dtypes, the non-null counts, and the memory usage.

In [3]:
# info() prints its summary directly; the cell has no return value to display.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
outlook     14 non-null object
temp        14 non-null object
humidity    14 non-null object
windy       14 non-null bool
play        14 non-null object
dtypes: bool(1), object(4)
memory usage: 542.0+ bytes


### data.head()

Return the first `n` rows of the data.
`n` defaults to 5 if it is not specified.

In [4]:
# Show the first 10 records; n is passed by keyword to make the row count explicit.
data.head(n=10)

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


## data.tail()

Return the last `n` rows.

In [5]:
# Show the last 5 records; n is passed by keyword to make the row count explicit.
data.tail(n=5)

Unnamed: 0,outlook,temp,humidity,windy,play
9,rainy,mild,normal,False,yes
10,sunny,mild,normal,True,yes
11,overcast,mild,high,True,yes
12,overcast,hot,normal,False,yes
13,rainy,mild,high,True,no


## data.columns

Get column names of a data frame

In [6]:
# The column labels, returned as a pandas Index.
data.columns

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')

## data.index

Get dataframe index

In [7]:
# The row labels; for this frame a RangeIndex starting at 0.
data.index

RangeIndex(start=0, stop=14, step=1)

## data.dtypes

Get column datatypes

In [8]:
# One dtype per column, returned as a Series indexed by column label.
data.dtypes

outlook     object
temp        object
humidity    object
windy         bool
play        object
dtype: object

### data.shape

Returns number of rows and columns

In [9]:
# (number of rows, number of columns) as a tuple.
data.shape

(14, 5)

### data.values

Only the values in the DataFrame will be returned, the axes labels
will be removed.

In [10]:
# The cell contents as a NumPy array; the mixed column types give dtype=object.
data.values 

array([['sunny', 'hot', 'high', False, 'no'],
       ['sunny', 'hot', 'high', True, 'no'],
       ['overcast', 'hot', 'high', False, 'yes'],
       ['rainy', 'mild', 'high', False, 'yes'],
       ['rainy', 'cool', 'normal', False, 'yes'],
       ['rainy', 'cool', 'normal', True, 'no'],
       ['overcast', 'cool', 'normal', True, 'yes'],
       ['sunny', 'mild', 'high', False, 'no'],
       ['sunny', 'cool', 'normal', False, 'yes'],
       ['rainy', 'mild', 'normal', False, 'yes'],
       ['sunny', 'mild', 'normal', True, 'yes'],
       ['overcast', 'mild', 'high', True, 'yes'],
       ['overcast', 'hot', 'normal', False, 'yes'],
       ['rainy', 'mild', 'high', True, 'no']], dtype=object)

## data['column name'].unique()

Get unique values of a specified column.

In [11]:
# Distinct values of the outlook column, as a NumPy array.
data['outlook'].unique()

array(['sunny', 'overcast', 'rainy'], dtype=object)

## data['column name'].nunique()

The nunique( ) shows the number of unique values.

In [12]:
# Count of distinct values in the outlook column.
data['outlook'].nunique()

3

## data['column name'].value_counts()

`value_counts()` creates a frequency distribution: one row per unique value with its count. By default `ascending=False`, i.e. the value with the highest frequency is shown at the top.

In [13]:
# Frequency of each outlook value, most frequent first.
data['outlook'].value_counts()

sunny       5
rainy       5
overcast    4
Name: outlook, dtype: int64

## Data Cleaning

### data.columns = ['new col name1','new col name2']

Rename **all** column names of the
DataFrame at once.

In [14]:
# Column labels before renaming, for comparison with the result below.
data.columns

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')

In [15]:
# Assigning a list to .columns relabels every column in order;
# here the five existing labels are replaced by their upper-case forms.
data.columns = ['OUTLOOK','TEMP','HUMIDITY','WINDY','PLAY']

In [16]:
# Display the frame to confirm the new upper-case column labels.
data

Unnamed: 0,OUTLOOK,TEMP,HUMIDITY,WINDY,PLAY
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


### data.rename()

`data.rename(columns={'old_columnname': 'new_columnname'}, inplace=True)`

Rename specific columns, leaving the other column names unchanged.

In [17]:
# Rename only OUTLOOK back to lower case; the other labels stay upper case.
# inplace=True modifies `data` directly instead of returning a new DataFrame.
data.rename(columns={"OUTLOOK":"outlook"},inplace=True)

In [18]:
# Display the frame to confirm that only the outlook label changed.
data

Unnamed: 0,outlook,TEMP,HUMIDITY,WINDY,PLAY
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


### data['column name'].astype( )

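`astype()` casts a column to another dtype and returns the converted column. Suppose we want to convert a column to float (a numeric type with decimals): check the current dtypes first, then call `data['column name'].astype(float)`.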

In [None]:
# Current dtypes, shown for reference: WINDY is the boolean column.
display(data.dtypes)
# Cast WINDY to float (False -> 0.0, True -> 1.0). astype() returns a new
# Series, so `data` itself keeps its original dtypes.
data['WINDY'].astype(float)

### data.replace()

Replace the values given in `to_replace` with `value`. Matching values anywhere in the DataFrame (or Series) are replaced with the new values.

`data.replace(to_replace,value)`

In [None]:
# Encode the PLAY labels as integers: "yes" -> 1, "no" -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data["PLAY"]: that chained in-place call acts
# on an intermediate Series and, with pandas Copy-on-Write, leaves `data` unchanged.
data["PLAY"] = data["PLAY"].replace(["yes", "no"], [1, 0])

In [None]:
# Display the frame to inspect the PLAY column after the replacement.
data

### data.set_index()

Set the DataFrame index (row labels) using one or more existing
columns. By default yields a new object.

In [None]:
# Use the outlook column as the row index; the result is a new DataFrame
# (it is displayed, not assigned), so `data` keeps its RangeIndex.
data.set_index("outlook")

In [None]:
# Build a two-level index from outlook and HUMIDITY.
# The humidity column is labelled 'HUMIDITY' at this point (all labels were
# upper-cased earlier and only 'outlook' was renamed back), so the lower-case
# name would raise a KeyError.
data.set_index(["outlook", "HUMIDITY"])

## Sort and Filter the Data

### data.sort_values()

`df.sort_values(by=['Column1', 'Column2'], ascending=[True, True])`

Sort by the values along either axis

In [None]:
# Order the rows by the outlook column (ascending); `by` is given positionally.
data.sort_values('outlook')

In [None]:
# Sort by outlook first, then by temperature within each outlook value.
# The temperature column is labelled 'TEMP' at this point (only 'outlook' was
# renamed back to lower case), so the lower-case name would raise a KeyError.
data.sort_values(by=['outlook', 'TEMP'])

## data.sort_index()

Sort object by labels (along an axis)

In [None]:
# Order the rows by index label, highest label first.
data.sort_index(ascending=False)

## data.T

To transpose rows into columns and columns into rows

In [None]:
# Transposed view: the column labels become the row index and vice versa.
data.T

## data.drop()

Pandas provides the `.drop()` method to delete rows or columns from a data frame. Rows are removed by index label and columns by column name.

**`DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')`**

- labels: String or list of strings referring row or column name.
- axis: int or string value; `0` or `'index'` for rows, `1` or `'columns'` for columns.
- index or columns: Single label or list. `index=...` and `columns=...` are an alternative to `labels` with `axis`; the two styles cannot be combined in one call.
- level: Used to specify level in case data frame is having multiple level index.
- inplace: Makes changes in original Data Frame if True.
- errors: With `errors='ignore'`, labels that don't exist are skipped without an error and the remaining labels are dropped; the default `'raise'` raises an error instead.

In [21]:
# Peek at the first two rows before dropping columns.
data.head(n=2)

Unnamed: 0,outlook,TEMP,HUMIDITY,WINDY,PLAY
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no


In [22]:
# Drop the outlook column; columns= is the keyword equivalent of axis=1.
# The result is a new DataFrame, `data` itself is not modified.
data.drop(columns='outlook')

Unnamed: 0,TEMP,HUMIDITY,WINDY,PLAY
0,hot,high,False,no
1,hot,high,True,no
2,hot,high,False,yes
3,mild,high,False,yes
4,cool,normal,False,yes
5,cool,normal,True,no
6,cool,normal,True,yes
7,mild,high,False,no
8,cool,normal,False,yes
9,mild,normal,False,yes


In [24]:
# Drop the outlook and WINDY columns in one call; columns= is the keyword
# equivalent of passing the labels with axis=1.
data.drop(columns=['outlook', 'WINDY'])

Unnamed: 0,TEMP,HUMIDITY,PLAY
0,hot,high,no
1,hot,high,no
2,hot,high,yes
3,mild,high,yes
4,cool,normal,yes
5,cool,normal,no
6,cool,normal,yes
7,mild,high,no
8,cool,normal,yes
9,mild,normal,yes


## Data Subset

In [None]:
# Keep the rows where play is "yes".
# By this point the column is labelled 'PLAY' (labels were upper-cased earlier)
# and its values were recoded by the replace() cell (yes -> 1, no -> 0), so the
# original test data['play'] == 'yes' would raise a KeyError.
data[data['PLAY'] == 1]

In [None]:
# Rows where play is "yes" AND the temperature is hot.
# Uses the current labels 'PLAY' / 'TEMP' and the 1/0 encoding that the
# replace() cell gave PLAY; the lower-case labels would raise a KeyError.
# Each condition needs its own parentheses because & binds tighter than ==.
data[(data['PLAY'] == 1) & (data['TEMP'] == 'hot')]

In [None]:
# Rows whose outlook is either sunny or overcast, expressed as a membership test.
wanted_outlooks = ['sunny', 'overcast']
data[data['outlook'].isin(wanted_outlooks)]

## DataFrame.query()

Analyzing data requires a lot of filtering operations. Pandas provides many methods to filter a data frame, and `DataFrame.query()` is one of them: it selects rows using a boolean expression written as a string.

**`DataFrame.query(expr, inplace=False)`**

- expr: Expression in string form to filter data.
- inplace: Make changes in the original data frame if True


In [None]:
# Column names are written bare inside the expression; string values are quoted.
data.query("outlook == 'sunny'")

In [None]:
# Sunny days on which play is "yes".
# The column is labelled 'PLAY' and holds 1/0 after the earlier rename and
# replace() cells, so the original `play == 'yes'` would fail with an
# undefined-name error.
data.query("outlook == 'sunny' and PLAY == 1")

In [None]:
# Cool days, plus mild days on which play is "no".
# `and` binds tighter than `or`; the parentheses spell out the grouping the
# original expression already had. Column names and the PLAY value follow the
# current state of `data` ('TEMP', 'PLAY', no -> 0); the lower-case names
# `temp` / `play` are not defined in the frame.
data.query("TEMP == 'cool' or (TEMP == 'mild' and PLAY == 0)")