### This notebook covers:
1. Introduction to pandas' first data structure: Series
2. Attributes and Methods (focus: selecting from series)

### Revision:
* Series: 
    - 1D labeled array of any data type, 
    - mixed data types allowed, 
    - dtype('O') = object for heterogeneous data type in numpy
    - s = pd.Series(data=list_of_values, index=list_of_Index(optional), dtype='string/float')
* Attributes:
    - s.dtype, s.name, s.size, s.shape, s.ndim
* Index and RangeIndex
*  Accessing elements: 
    - head(n), tail(n)
    - By Index: s[3], s[5:], s[-7:]
    - By Label: s['A'], s['A': 'F'], s.A, s.H
    - By loc: s.loc['A'], s.loc['A':'D'], s.loc[['A', 'C', 'F']]
    - By iloc: s.iloc[3], s.iloc[3:8], s.iloc[[2,5,6]]
    - By masking: s.loc[[True if i%2==0 else False for i in range(s.size)]]
    - Using callables with loc/iloc: any function which returns list of indexes/labels or boolean mask
    - get method: returns default value if label/index doesn't exist. s.get('f',default='Value does not exist')

In [2]:
# Series are - 
# one dimensional labeled arrays of any data type
# Sequence of values with associated labels

In [1]:
import pandas as pd

In [2]:
students = ['Chetan','John','Mak']
pd.Series(data=students)   # parameter = data, argument = students

0    Chetan
1      John
2       Mak
dtype: object

In [4]:
ages = [18,25,33]
s = pd.Series(ages)
print(s, s.index, type(s.index))
print(s.name, s.dtype, s.ndim, s.size, s.shape)

0    18
1    25
2    33
dtype: int64 RangeIndex(start=0, stop=3, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
None int64 1 3 (3,)


In [7]:
heights = [167.3, 172.6, 170]
pd.Series(heights)

0    167.3
1    172.6
2    170.0
dtype: float64

In [10]:
mixed = [True,'Hello',{1: 'value'}]
pd.Series(mixed)                       # mixed data types are allowed in series

0            True
1           Hello
2    {1: 'value'}
dtype: object

In [13]:
list_of_books = ['Zero to one', 'Fooled by randomness', 'Sapiens']
list_a = pd.Series(list_of_books)
print(list_a)

0             Zero to one
1    Fooled by randomness
2                 Sapiens
dtype: object


In [19]:
dict_of_books = {0:'Zero to one',1:'Fooled by randomness', 2: 'Sapiens'}
list_b = pd.Series(dict_of_books)
print(list_b)
print(list_a.equals(list_b))

0             Zero to one
1    Fooled by randomness
2                 Sapiens
dtype: object
True


In [20]:
print(pd.Series(45))
pd.Series('example') # a bare scalar is accepted too: with no labels given, pandas supplies a default integer label (0)

0    45
dtype: int64


0    example
dtype: object

In [25]:
# dtype attribute
print(pd.Series(data=ages, dtype='float'))
pd.Series('hello').dtype

0    18.0
1    25.0
2    33.0
dtype: float64


dtype('O')

In [26]:
# dtype('O') stands for "object"
# numpy expects a homogeneous, fixed-size data type - e.g. float
# strings are variable length, so they do not fit a fixed-size slot
# numpy therefore stores an object reference/pointer to each object in memory.

#### Index and RangeIndex

In [29]:
list_a = pd.Series(data=list_of_books, index=['book1','book2','book3'], dtype='string')

In [30]:
list_a.index

Index(['book1', 'book2', 'book3'], dtype='object')

In [31]:
list_b = pd.Series(data=list_of_books, dtype='string')
list_b.index

RangeIndex(start=0, stop=3, step=1)

In [32]:
list(pd.RangeIndex(start=10,stop=-10,step=-1))

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9]

In [36]:
list_a = pd.Series(data=list_of_books, index=list(pd.RangeIndex(start=4,stop=7,step=1)), dtype='string')
list_a

4             Zero to one
5    Fooled by randomness
6                 Sapiens
dtype: string

In [40]:
print(list_a.size)
print(list_a.dtype)
print(list_a.name)
list_a.name = 'My Books'
print(list_a.name)
list_a.equals(list_b)

3
string
None
My Books


False

In [43]:
# Challenge 1: build the same name -> age Series in two different ways.
actor_names = ['Hritik', 'Sushant', 'Ritesh', 'Salman']
actor_ages = [45, 35, 40, 50]

# Way 1: values plus an explicit index, and a name for the Series itself.
actor_series = pd.Series(data=actor_ages, index=actor_names, name='actors')
print(actor_series)

# Way 2: from a dict, whose keys become the index labels.
# dict(zip(...)) pairs each name with its age without a hand-written comprehension.
actors = pd.Series(dict(zip(actor_names, actor_ages)))
print(actors)

Hritik     45
Sushant    35
Ritesh     40
Salman     50
Name: actors, dtype: int64
Hritik     45
Sushant    35
Ritesh     40
Salman     50
dtype: int64


#### head() and tail() methods

In [45]:
int_series = pd.Series(range(50))
print(int_series)
print(int_series.size)
print(int_series.tail(5))   # last 5 rows; 5 is also what tail() uses when n is omitted
print(int_series.head(5))   # first 5 rows; 5 is also what head() uses when n is omitted

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
26    26
27    27
28    28
29    29
30    30
31    31
32    32
33    33
34    34
35    35
36    36
37    37
38    38
39    39
40    40
41    41
42    42
43    43
44    44
45    45
46    46
47    47
48    48
49    49
dtype: int64
50
45    45
46    46
47    47
48    48
49    49
dtype: int64
0    0
1    1
2    2
3    3
4    4
dtype: int64


#### Accessing elements by index position

In [55]:
from string import ascii_lowercase, ascii_uppercase
letters = list(ascii_lowercase)
alphabet = pd.Series(letters)
alphabet.size

26

In [49]:
alphabet.head()

0    a
1    b
2    c
3    d
4    e
dtype: object

In [66]:
alphabet[1], alphabet[4]

('b', 'e')

In [60]:
# find 11th letter
print(alphabet[10])
# find first 3 letters
print(alphabet[:3])
# get 6-10th letters
print(alphabet[5:10])
# what are the last 6 letters
print(alphabet[-6:])

k
0    a
1    b
2    c
dtype: object
5    f
6    g
7    h
8    i
9    j
dtype: object
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object


#### Accessing element by label

In [56]:
labeled_alphabet = pd.Series(data=list(ascii_lowercase),index=list(ascii_uppercase))

In [68]:
labeled_alphabet['A'], labeled_alphabet.A  # slicing not possible with dot notation

('a', 'a')

In [61]:
# find 11th letter
print(labeled_alphabet['K'])
# find first 3 letters
print(labeled_alphabet[:'C'])
# get 6-10th letters
print(labeled_alphabet['F':'J'])
# what are the last 6 letters
print(labeled_alphabet['U':])

k
A    a
B    b
C    c
dtype: object
F    f
G    g
H    h
I    i
J    j
dtype: object
U    u
V    v
W    w
X    x
Y    y
Z    z
dtype: object


#### add_prefix() and add_suffix()

In [63]:
alphabet.add_prefix('label_')    # not changing original alphabet

label_0     a
label_1     b
label_2     c
label_3     d
label_4     e
label_5     f
label_6     g
label_7     h
label_8     i
label_9     j
label_10    k
label_11    l
label_12    m
label_13    n
label_14    o
label_15    p
label_16    q
label_17    r
label_18    s
label_19    t
label_20    u
label_21    v
label_22    w
label_23    x
label_24    y
label_25    z
dtype: object

In [64]:
alphabet.add_suffix('_label')

0_label     a
1_label     b
2_label     c
3_label     d
4_label     e
5_label     f
6_label     g
7_label     h
8_label     i
9_label     j
10_label    k
11_label    l
12_label    m
13_label    n
14_label    o
15_label    p
16_label    q
17_label    r
18_label    s
19_label    t
20_label    u
21_label    v
22_label    w
23_label    x
24_label    y
25_label    z
dtype: object

#### loc

In [69]:
labeled_alphabet['A':'C']

A    a
B    b
C    c
dtype: object

In [71]:
labeled_alphabet.loc['A':'C']     # .loc makes label-based selection explicit; a label slice includes both endpoints

A    a
B    b
C    c
dtype: object

#### Masks:

In [73]:
labeled_alphabet[:3].loc[[True,True,False]]

A    a
B    b
dtype: object

In [76]:
# Build the vowel mask in one vectorized step instead of looping over positions.
# isin() returns a boolean Series aligned with alphabet's own index, so this works
# for a Series of any length (no hard-coded 26).
alpha_mask = alphabet.isin(list('aeiou'))

alphabet.loc[alpha_mask]

0     a
4     e
8     i
14    o
20    u
dtype: object

In [77]:
# The same vowel mask as a list comprehension: `letter in 'aieou'` is already a bool,
# and iterating the Series directly avoids hard-coding its length.
alphabet.loc[[letter in 'aieou' for letter in alphabet]]

0     a
4     e
8     i
14    o
20    u
dtype: object

#### iloc:

In [78]:
# loc = indexing by labels
# iloc = indexing by position

In [80]:
print(alphabet.iloc[0])
print(alphabet.iloc[1:3])
print(alphabet.iloc[[1,10,2]])

a
1    b
2    c
dtype: object
1     b
10    k
2     c
dtype: object


#### Using callables with loc and iloc

In [81]:
# A single argument function that returns indexing output

In [84]:
labeled_alphabet.loc[lambda x: ['A','C']]

A    a
C    c
dtype: object

In [89]:
def every_fifth(x):
    """Return a boolean mask that is True at every fifth position of ``x``.

    Intended as a callable indexer for ``loc``/``iloc``; only ``x.size`` is read.

    Parameters
    ----------
    x : object with a ``size`` attribute (e.g. a pandas Series)
        The object being indexed.

    Returns
    -------
    list of bool
        One entry per element of ``x``; True at 0-based positions 4, 9, 14, ...
        (i.e. the 5th, 10th, 15th, ... elements).
    """
    # The comparison already yields a bool, so no `True if ... else False` is needed.
    return [(n + 1) % 5 == 0 for n in range(x.size)]

In [90]:
alphabet.iloc[every_fifth]

4     e
9     j
14    o
19    t
24    y
dtype: object

#### get() method

In [96]:
# gives default value if index/label doesn't exist

In [98]:
# get() looks a label up and falls back to `default` (None unless given) when it is missing.
print(labeled_alphabet.get('A', default=None))
print(labeled_alphabet.get('SS'))
print(labeled_alphabet.get('SS', default='Provided label does not exist.'))
# Positional access belongs to iloc: get(4) on a string-labelled Series only worked through
# the deprecated fallback that treats an integer key as a position.
print(labeled_alphabet.iloc[4])

a
None
Provided label does not exist.
e


In [101]:
# Challenge: build the squares of 0..99 and compare two ways of taking the last three.

squares = pd.Series(range(100)) ** 2
last_three_sliced = squares[-3:]
last_three_tail = squares.tail(3)
print(last_three_sliced)
print(last_three_tail)
print(last_three_tail.equals(last_three_sliced))
last_three_tail == last_three_sliced

97    9409
98    9604
99    9801
dtype: int64
97    9409
98    9604
99    9801
dtype: int64
True


97    True
98    True
99    True
dtype: bool