#### Data Analysis and Manipulation with Pandas

In [1]:
import pandas as pd
import numpy as np

### 1. Pandas Series

In [2]:
s1 = pd.Series([1, 2 ,3 , 4, 5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
type(s1)

pandas.core.series.Series

In [4]:
s1 = pd.Series([1, 2 , 3, 4, 5], index=['x', 'x1', 'x2', 'x3', 'x4'])
s1

x     1
x1    2
x2    3
x3    4
x4    5
dtype: int64

In [5]:
s2 = s1 * 2
s2

x      2
x1     4
x2     6
x3     8
x4    10
dtype: int64

In [6]:
s3 = s1 * 2
s3

x      2
x1     4
x2     6
x3     8
x4    10
dtype: int64

### 2. Pandas DataFrame

In [7]:
df = pd.DataFrame({
    'Name': ['Ann', 'Bob', 'Noor'],
    'Marks': [76, 88, 86],
    'Grades': ['B', 'A', 'A']
})
df

Unnamed: 0,Name,Marks,Grades
0,Ann,76,B
1,Bob,88,A
2,Noor,86,A


### 3. Working with iris.csv dataset

In [8]:
iris = pd.read_csv('iris.csv')
iris

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [9]:
type(iris)

pandas.core.frame.DataFrame

In [10]:
first_column = iris['sepal.length']
first_column

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal.length, Length: 150, dtype: float64

In [11]:
type(first_column)

pandas.core.series.Series

In [12]:
iris.head()     # If want more value, put the value into "head()"

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [13]:
iris.tail()     # If want more value, put the value into "tail()"

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [14]:
iris.shape

(150, 5)

In [15]:
iris.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [16]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Loc and iloc
loc and iloc are for location,
loc is without index,
iloc is with index

In [17]:
iris.iloc[1:4, 1:3]     # iloc[row, col]    [start is inclued, end is excluded]

Unnamed: 0,sepal.width,petal.length
1,3.0,1.4
2,3.2,1.3
3,3.1,1.5


In [18]:
iris.iloc[5:11, 2:]

Unnamed: 0,petal.length,petal.width,variety
5,1.7,0.4,Setosa
6,1.4,0.3,Setosa
7,1.5,0.2,Setosa
8,1.4,0.2,Setosa
9,1.5,0.1,Setosa
10,1.5,0.2,Setosa


In [19]:
iris.loc[0:11, ('petal.length', 'petal.width')]     # [Both start and end are included]

Unnamed: 0,petal.length,petal.width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
5,1.7,0.4
6,1.4,0.3
7,1.5,0.2
8,1.4,0.2
9,1.5,0.1


### Drop method

In [20]:
new_iris = iris.drop('sepal.length', axis=1)
new_iris

Unnamed: 0,sepal.width,petal.length,petal.width,variety
0,3.5,1.4,0.2,Setosa
1,3.0,1.4,0.2,Setosa
2,3.2,1.3,0.2,Setosa
3,3.1,1.5,0.2,Setosa
4,3.6,1.4,0.2,Setosa
...,...,...,...,...
145,3.0,5.2,2.3,Virginica
146,2.5,5.0,1.9,Virginica
147,3.0,5.2,2.0,Virginica
148,3.4,5.4,2.3,Virginica


In [1]:
new_iris = new_iris.drop('petal.length', axis=1, inplace=True)
new_iris

NameError: name 'new_iris' is not defined

In [22]:
new_iris1 = new_iris.drop([2, 145, 148], axis = 0)

AttributeError: 'NoneType' object has no attribute 'drop'

In [24]:
iris.median()

  iris.median()


sepal.length    5.80
sepal.width     3.00
petal.length    4.35
petal.width     1.30
dtype: float64

In [25]:
iris.min()

sepal.length       4.3
sepal.width        2.0
petal.length       1.0
petal.width        0.1
variety         Setosa
dtype: object

In [26]:
iris.max()

sepal.length          7.9
sepal.width           4.4
petal.length          6.9
petal.width           2.5
variety         Virginica
dtype: object

### Apply method
can use apply method to define your own user defined functions on the dataset

In [27]:
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [28]:
def mult(s):
    return s * 3

df = iris[['petal.length']].apply(mult)
df.head()

Unnamed: 0,petal.length
0,4.2
1,4.2
2,3.9
3,4.5
4,4.2


In [29]:
def half(s):
    return s * 0.5

df = iris[['petal.length']].apply(half)
df.head()

Unnamed: 0,petal.length
0,0.7
1,0.7
2,0.65
3,0.75
4,0.7


### Counting and sorting

In [30]:
iris['variety'].value_counts()

Setosa        50
Versicolor    50
Virginica     50
Name: variety, dtype: int64

In [31]:
iris['petal.length'].value_counts()

1.4    13
1.5    13
5.1     8
4.5     8
1.6     7
1.3     7
5.6     6
4.7     5
4.9     5
4.0     5
4.2     4
5.0     4
4.4     4
4.8     4
1.7     4
3.9     3
4.6     3
5.7     3
4.1     3
5.5     3
6.1     3
5.8     3
3.3     2
5.4     2
6.7     2
5.3     2
5.9     2
6.0     2
1.2     2
4.3     2
1.9     2
3.5     2
5.2     2
3.0     1
1.1     1
3.7     1
3.8     1
6.6     1
6.3     1
1.0     1
6.9     1
3.6     1
6.4     1
Name: petal.length, dtype: int64

In [33]:
iris.sort_values(by = 'variety', ascending=False)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
149,5.9,3.0,5.1,1.8,Virginica
111,6.4,2.7,5.3,1.9,Virginica
122,7.7,2.8,6.7,2.0,Virginica
121,5.6,2.8,4.9,2.0,Virginica
120,6.9,3.2,5.7,2.3,Virginica
...,...,...,...,...,...
31,5.4,3.4,1.5,0.4,Setosa
30,4.8,3.1,1.6,0.2,Setosa
29,4.7,3.2,1.6,0.2,Setosa
28,5.2,3.4,1.4,0.2,Setosa


In [35]:
iris.sort_values(by = ['variety', 'petal.length'], ascending=[True, False])

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
24,4.8,3.4,1.9,0.2,Setosa
44,5.1,3.8,1.9,0.4,Setosa
5,5.4,3.9,1.7,0.4,Setosa
18,5.7,3.8,1.7,0.3,Setosa
20,5.4,3.4,1.7,0.2,Setosa
...,...,...,...,...,...
123,6.3,2.7,4.9,1.8,Virginica
127,6.1,3.0,4.9,1.8,Virginica
126,6.2,2.8,4.8,1.8,Virginica
138,6.0,3.0,4.8,1.8,Virginica
