In [1]:
import pandas as pd

In [2]:
# create a new Series dictionary
veg = pd.Series({'potato':3,'tomato':6,'chilly':2,'pumpkin':8,'carrot':10,'radish':4})
veg

potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64

In [3]:
# extracting all the values
veg.values

array([ 3,  6,  2,  8, 10,  4], dtype=int64)

In [4]:
# we need all the indexes that we have
veg.index

Index(['potato', 'tomato', 'chilly', 'pumpkin', 'carrot', 'radish'], dtype='object')

In [5]:
#shape (total number of record/element we have)
veg.shape

(6,)

In [6]:
# subset series

#option_1
veg[0] # we call the first element using position/index

3

In [7]:
#option_2 
veg['carrot'] #we call back the value with the label

10

In [8]:
veg['pumpkin':'radish'] #range of records

pumpkin     8
carrot     10
radish      4
dtype: int64

In [9]:
veg[1:4] #from 1 to 3 / last range is excluded it will got till 3 index only 


tomato     6
chilly     2
pumpkin    8
dtype: int64

In [10]:
veg[['tomato','radish','pumpkin']]

tomato     6
radish     4
pumpkin    8
dtype: int64

In [11]:
veg['tomato':] #from tomate to the last element

tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64

In [12]:
veg[ :'pumpkin'] #from potato to pumpkin

potato     3
tomato     6
chilly     2
pumpkin    8
dtype: int64

In [13]:
veg[1:4] #start by 1 till 3

tomato     6
chilly     2
pumpkin    8
dtype: int64

In [14]:
veg[1::2] #extract every second element

tomato     6
pumpkin    8
radish     4
dtype: int64

In [15]:
veg[1::3] #extract every third element

tomato     6
carrot    10
dtype: int64

In [16]:
veg

potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64

In [17]:
#iloc/loc
#loc: label based indexing
#iloc: position based indexing

In [18]:
#veg.loc['tomato':'carrot']
veg['tomato':'carrot']

tomato      6
chilly      2
pumpkin     8
carrot     10
dtype: int64

In [19]:
#veg.iloc[0:4]
veg[0:4] # potato to pumpking position

potato     3
tomato     6
chilly     2
pumpkin    8
dtype: int64

In [20]:
veg>2 #boolean output / chilly = 2

potato      True
tomato      True
chilly     False
pumpkin     True
carrot      True
radish      True
dtype: bool

In [21]:
veg<2 #boolean output / chilly = 2

potato     False
tomato     False
chilly     False
pumpkin    False
carrot     False
radish     False
dtype: bool

In [22]:
veg>=2 #boolean output / chilly = 2

potato     True
tomato     True
chilly     True
pumpkin    True
carrot     True
radish     True
dtype: bool

In [23]:
veg<=2 #chilly =2

potato     False
tomato     False
chilly      True
pumpkin    False
carrot     False
radish     False
dtype: bool

In [24]:
veg[veg>2] #filtered output

potato      3
tomato      6
pumpkin     8
carrot     10
radish      4
dtype: int64

In [25]:
veg

potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64

In [26]:
veg.head() #for top 5 elements

potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
dtype: int64

In [27]:
veg.head(2) #for top 2 elements

potato    3
tomato    6
dtype: int64

In [28]:
veg.tail() #5 from bottom

tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64

In [29]:
veg.tail(1) #last element

radish    4
dtype: int64

In [30]:
veg[2:5]

chilly      2
pumpkin     8
carrot     10
dtype: int64

In [31]:
#check any values belongs to the series 
veg.isin([50])

potato     False
tomato     False
chilly     False
pumpkin    False
carrot     False
radish     False
dtype: bool

In [32]:
veg.isin([2]) #we compare the values

potato     False
tomato     False
chilly      True
pumpkin    False
carrot     False
radish     False
dtype: bool

In [33]:
veg[veg.isin([2,4])]

chilly    2
radish    4
dtype: int64

In [34]:
# find unique values and their frequencies
new = pd.Series(list('abcd'*4))
new

0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [35]:
#unique() : name of all unique records
#nunique() : count of unique records
#value_counts() : name of uniques records with their occurences

In [36]:
new


0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [37]:
new.unique()

array(['a', 'b', 'c', 'd'], dtype=object)

In [38]:
new.nunique()

4

In [39]:
new.value_counts()

a    4
b    4
c    4
d    4
dtype: int64

In [40]:
# dealing with duplicates
new

0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [41]:
new.duplicated()

0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
dtype: bool

In [42]:
new[new.duplicated()]

4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [43]:
new.drop_duplicates() # drop the duplicated values from the list

0    a
1    b
2    c
3    d
dtype: object

In [44]:
#sorting the data
#sort_values, sort_index

In [45]:
veg.sort_values

<bound method Series.sort_values of potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64>

In [46]:
veg.sort_index

<bound method Series.sort_index of potato      3
tomato      6
chilly      2
pumpkin     8
carrot     10
radish      4
dtype: int64>

In [47]:
new.sort_values

<bound method Series.sort_values of 0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object>

In [48]:
new.sort_index

<bound method Series.sort_index of 0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object>

In [50]:
#dealing with missing data
#create a new series
import numpy as np
x = pd.Series([2,4,5,3,np.nan,3,np.nan])
x

0    2.0
1    4.0
2    5.0
3    3.0
4    NaN
5    3.0
6    NaN
dtype: float64

In [51]:
x.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of 0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool>

In [52]:
x.isnull()

0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [54]:
x.fillna(2.5) #fill out missing records

0    2.0
1    4.0
2    5.0
3    3.0
4    2.5
5    3.0
6    2.5
dtype: float64

In [55]:
x.fillna(x.mean())

0    2.0
1    4.0
2    5.0
3    3.0
4    3.4
5    3.0
6    3.4
dtype: float64

In [56]:
#ffill() bfill()
x.ffill()

0    2.0
1    4.0
2    5.0
3    3.0
4    3.0
5    3.0
6    3.0
dtype: float64

In [57]:
x.bfill()

0    2.0
1    4.0
2    5.0
3    3.0
4    3.0
5    3.0
6    NaN
dtype: float64

In [58]:
# drop the Nan values
x.dropna()

0    2.0
1    4.0
2    5.0
3    3.0
5    3.0
dtype: float64

In [59]:
check = pd.Series(['alex','mathhew','john','Franz'])
check

0       alex
1    mathhew
2       john
3      Franz
dtype: object

In [60]:
check.map(lambda x:len(x)) #calculating the length of x

0    4
1    7
2    4
3    5
dtype: int64

In [None]:
#Series: one dimension
#DataFrame: 2 dimensional, tabular, sql table, csv/excel etc

In [62]:
dt = pd.DataFrame() #empty dataframe
dt

In [64]:
dt1 = [5,3,7,6,5,4] # a data frame
dt2 = pd.DataFrame(dt1) #copy from dt1 data Frame
dt2

Unnamed: 0,0
0,5
1,3
2,7
3,6
4,5
5,4


In [65]:
data = [['alex',30],['mathhew',45],['romain',56]]
dt3 = pd.DataFrame(data,columns=['name','age'])
dt3

Unnamed: 0,name,age
0,alex,30
1,mathhew,45
2,romain,56


In [73]:
data ={'name':['tom','alan','jack','rosy','jess','lilly'],'age':[34,76,34,67,40,70]} #how to rename the columns
dt4 = pd.DataFrame(data,index=['rank1','rank2','rank3','rank4','rank5','rank6']) #how to get the index
dt4

Unnamed: 0,name,age
rank1,tom,34
rank2,alan,76
rank3,jack,34
rank4,rosy,67
rank5,jess,40
rank6,lilly,70


In [74]:
#dataframe attributes
dt4.index #index in the dataframe

Index(['rank1', 'rank2', 'rank3', 'rank4', 'rank5', 'rank6'], dtype='object')

In [75]:
dt4.columns #columns in the dataframe

Index(['name', 'age'], dtype='object')

In [76]:
dt4.dtypes #type

name    object
age      int64
dtype: object

In [77]:
dt4.shape

(6, 2)

In [78]:
dt4.head() #first 5 rows

Unnamed: 0,name,age
rank1,tom,34
rank2,alan,76
rank3,jack,34
rank4,rosy,67
rank5,jess,40


In [79]:
dt4.tail() # 5last rows

Unnamed: 0,name,age
rank2,alan,76
rank3,jack,34
rank4,rosy,67
rank5,jess,40
rank6,lilly,70


In [80]:
dt4.tail(2)

Unnamed: 0,name,age
rank5,jess,40
rank6,lilly,70


In [81]:
dt4[2:5] #last range will be excluded last-1 (5-1=4)

Unnamed: 0,name,age
rank3,jack,34
rank4,rosy,67
rank5,jess,40


In [82]:
dt4['name'] #extract columns from df

rank1      tom
rank2     alan
rank3     jack
rank4     rosy
rank5     jess
rank6    lilly
Name: name, dtype: object

In [83]:
#another way
dt4.name

rank1      tom
rank2     alan
rank3     jack
rank4     rosy
rank5     jess
rank6    lilly
Name: name, dtype: object

In [85]:
#extracting more then 2 columns
df = pd.DataFrame(np.random.randint(0,50,20).reshape(5,4),index=list('abcde'),columns=list('pqrs'))
df

Unnamed: 0,p,q,r,s
a,4,49,47,32
b,26,30,12,42
c,17,23,47,23
d,29,13,21,11
e,38,23,37,3


In [88]:
list('abcde')

['a', 'b', 'c', 'd', 'e']

In [89]:
#df['p'] #another way to extract particular column
df.p

a     4
b    26
c    17
d    29
e    38
Name: p, dtype: int32

In [90]:
df['q':'r']

Unnamed: 0,p,q,r,s


In [None]:
#loc: label based index
#iloc position based index

In [93]:
#df.loc[row,col]
df.loc['a':'d','p':'r']

Unnamed: 0,p,q,r
a,4,49,47
b,26,30,12
c,17,23,47
d,29,13,21


In [94]:
#df.iloc[] / start position and go to, start position and go to...
df.iloc[0:4,0:3]

Unnamed: 0,p,q,r
a,4,49,47
b,26,30,12
c,17,23,47
d,29,13,21
