### Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
array=np.arange(5)
array

array([0, 1, 2, 3, 4])

In [3]:
series=pd.Series(array,name='test array')
series

0    0
1    1
2    2
3    3
4    4
Name: test array, dtype: int32

In [4]:
series.values

array([0, 1, 2, 3, 4])

In [5]:
series.values.mean()

2.0

In [6]:
series.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
series.index=[10,20,30,40,50]

In [8]:
series.index

Int64Index([10, 20, 30, 40, 50], dtype='int64')

In [9]:
series.name

'test array'

In [10]:
series.name='crazy'

In [11]:
series.name

'crazy'

In [12]:
series.dtype

dtype('int32')

In [13]:
series

10    0
20    1
30    2
40    3
50    4
Name: crazy, dtype: int32

### Pandas Data Types and Type Conversion

In [14]:
series.astype('float')

10    0.0
20    1.0
30    2.0
40    3.0
50    4.0
Name: crazy, dtype: float64

In [15]:
series.astype('bool')

10    False
20     True
30     True
40     True
50     True
Name: crazy, dtype: bool

In [16]:
series.astype('object')

10    0
20    1
30    2
40    3
50    4
Name: crazy, dtype: object

In [17]:
series

10    0
20    1
30    2
40    3
50    4
Name: crazy, dtype: int32

In [18]:
series.astype('string')

10    0
20    1
30    2
40    3
50    4
Name: crazy, dtype: string

### The Series Index and custom slices

In [19]:
my_series=pd.Series(range(5))
my_series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [20]:
my_series[3]

3

In [21]:
my_series[1:3]   # integer slice on the default index: the stop point (3) is exclusive

1    1
2    2
dtype: int64

In [22]:
my_series[::2]

0    0
2    2
4    4
dtype: int64

In [23]:
my_series.index=['a','b','c','d','e']

In [24]:
my_series

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [25]:
my_series['a':'c']  # label slice on a custom index: the stop label ('c') is inclusive

a    0
b    1
c    2
dtype: int64

In [25]:
my_series[::2]

a    0
c    2
e    4
dtype: int64

### iloc[] and loc[] Accessors

In [26]:
my_series

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [27]:
my_series.iloc[2]

2

In [28]:
my_series.iloc[-3:]

c    2
d    3
e    4
dtype: int64

In [29]:
my_series.iloc[[1,3]]

b    1
d    3
dtype: int64

In [30]:
my_series.loc['c']

2

In [31]:
my_series.loc['b':'d']

b    1
c    2
d    3
dtype: int64

In [32]:
my_series.index=[0,2,3,100,5]

In [33]:
my_series.loc[0:5]

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [34]:
my_series.loc[my_series != 2]

0      0
2      1
100    3
5      4
dtype: int64

In [35]:
my_series=my_series.reset_index(drop=True)  # by default, reset_index() moves the existing index into a new column, producing a DataFrame;
# passing drop=True discards the old index instead, so the result stays a Series

In [36]:
my_series.loc[:3]

0    0
1    1
2    2
3    3
dtype: int64

In [37]:
my_series.loc[my_series !=2]

0    0
1    1
3    3
4    4
dtype: int64

###  Duplicate Index values & Resetting the index

In [38]:
my_series=pd.Series(range(5),index=['a','b','c','b','d'])
my_series

a    0
b    1
c    2
b    3
d    4
dtype: int64

In [39]:
my_series.loc['b']

b    1
b    3
dtype: int64

In [40]:
# .loc['b'] returns a Series whose labels are both 'b'. Pick the second match
# by position with .iloc; a bare [1] on a string-labelled Series relies on the
# deprecated fallback that treats an integer key as a position.
my_series.loc['b'].iloc[1]

3

In [41]:
my_series.reset_index()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,b,3
4,d,4


In [42]:
my_series

a    0
b    1
c    2
b    3
d    4
dtype: int64

In [43]:
my_series.reset_index(drop=True)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [44]:
my_series.reset_index(drop=True).iloc[2:4]

2    2
3    3
dtype: int64

In [45]:
my_series.reset_index(drop=True).loc[2:4]

2    2
3    3
4    4
dtype: int64

In [46]:
my_series

a    0
b    1
c    2
b    3
d    4
dtype: int64

### Filtering Series

In [47]:
my_series

a    0
b    1
c    2
b    3
d    4
dtype: int64

In [48]:
my_series[my_series<3]

a    0
b    1
c    2
dtype: int64

In [49]:
# Boolean mask: value greater than 2 AND index label equal to 'b'.
mask=my_series.gt(2) & (my_series.index=='b')

In [50]:
my_series.loc[mask]

b    3
dtype: int64

In [51]:
my_series.index

Index(['a', 'b', 'c', 'b', 'd'], dtype='object')

In [52]:
my_series.index=='b'

array([False,  True, False,  True, False])

In [53]:
my_series>2

a    False
b    False
c    False
b     True
d     True
dtype: bool

#### membership operator

In [54]:
my_series.isin([2,3])

a    False
b    False
c     True
b     True
d    False
dtype: bool

In [55]:
~my_series.isin([2,3])   # "not in": the tilde (~) inverts the boolean result element-wise

a     True
b     True
c    False
b    False
d     True
dtype: bool

In [56]:
my_series.loc[(my_series!=2)]

a    0
b    1
b    3
d    4
dtype: int64

In [57]:
my_series.loc[~(my_series!=2)]

c    2
dtype: int64

### Sorting Series

In [58]:
my_series.sort_values()

a    0
b    1
c    2
b    3
d    4
dtype: int64

In [59]:
my_series.sort_values(ascending=False)

d    4
b    3
c    2
b    1
a    0
dtype: int64

In [60]:
my_series.sort_index()

a    0
b    1
b    3
c    2
d    4
dtype: int64

In [61]:
my_series.sort_index(ascending=False)

d    4
c    2
b    1
b    3
a    0
dtype: int64

### Numeric Series Operators

In [62]:
#  Arithmetic operators that work on a numeric Series: +, -, *, /, //, %, **

In [63]:
my_series=pd.Series(range(5),index=['a','b','c','b','d'])
my_series

a    0
b    1
c    2
b    3
d    4
dtype: int64

In [64]:
my_series=my_series.astype('float')

In [65]:
my_series

a    0.0
b    1.0
c    2.0
b    3.0
d    4.0
dtype: float64

In [66]:
# Assign by position explicitly with .iloc. The index holds string labels, so
# my_series[1] = ... relies on the deprecated fallback that treats an integer
# key as a position.
my_series.iloc[1]=np.nan
my_series

a    0.0
b    NaN
c    2.0
b    3.0
d    4.0
dtype: float64

In [67]:
my_series+1

a    1.0
b    NaN
c    3.0
b    4.0
d    5.0
dtype: float64

In [68]:
my_series.add(1,fill_value=10)    # fill_value substitutes 10 for NaN before the addition (NaN -> 10 + 1 = 11)

a     1.0
b    11.0
c     3.0
b     4.0
d     5.0
dtype: float64

In [69]:
my_series2=my_series.add(1,fill_value=0)
my_series2

a    1.0
b    1.0
c    3.0
b    4.0
d    5.0
dtype: float64

In [87]:
my_series+my_series2

a    1.0
b    NaN
c    5.0
b    7.0
d    9.0
dtype: float64

In [70]:
my_series.add(my_series2,fill_value=10)

a     1.0
b    11.0
c     5.0
b     7.0
d     9.0
dtype: float64

### Text Series Operations

In [71]:
my_series3=pd.Series(['day 0','day 1','day 2','day 3','day 4'])
my_series3

0    day 0
1    day 1
2    day 2
3    day 3
4    day 4
dtype: object

In [72]:
my_series3.str.contains('1')

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [73]:
my_series3.str.upper()

0    DAY 0
1    DAY 1
2    DAY 2
3    DAY 3
4    DAY 4
dtype: object

In [74]:
# str.strip('day ') strips any of the characters 'd', 'a', 'y', ' ' from both
# ends of each string; it does not remove the literal prefix. Remove the exact
# leading 'day ' instead so values that start or end with those letters are safe.
my_series3.str.replace(r'^day ','',regex=True)

0    0
1    1
2    2
3    3
4    4
dtype: object

In [75]:
my_series3.str[-1].astype('int')

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [76]:
my_series3.str.split(' ')

0    [day, 0]
1    [day, 1]
2    [day, 2]
3    [day, 3]
4    [day, 4]
dtype: object

In [80]:
type(my_series3.str.split(' '))

pandas.core.series.Series

In [77]:
my_series3.str.split(' ')[0][0]

'day'

In [78]:
my_series3.str.split(' ',expand=True)

Unnamed: 0,0,1
0,day,0
1,day,1
2,day,2
3,day,3
4,day,4


In [79]:
type(my_series3.str.split(' ',expand=True))

pandas.core.frame.DataFrame

### Numerical Series Aggregation

In [81]:
# Read the retail transactions file, take its "transactions" column as a
# Series, and preview the first five rows.
transactions = pd.read_csv("../retail/transactions.csv")
transactions_series = pd.Series(transactions["transactions"])
transactions_series.head(5)

0     770
1    2111
2    2358
3    3487
4    1922
Name: transactions, dtype: int64

In [82]:
transactions_series.count()

83488

In [83]:
transactions_series.sum()

141478945

In [88]:
transactions_series.quantile([.25,.5,.75,1.0])

0.25    1046.0
0.50    1393.0
0.75    2079.0
1.00    8359.0
Name: transactions, dtype: float64

In [89]:
transactions_series.max()

8359

In [90]:
transactions_series.iloc[:5]

0     770
1    2111
2    2358
3    3487
4    1922
Name: transactions, dtype: int64

In [108]:
transactions_series.iloc[:5].quantile(0.4)

2035.4

In [91]:
transactions_series.iloc[:5].quantile(0.4,interpolation='nearest')
# interpolation='nearest' returns a value that actually occurs in the data instead of an interpolated one

2111

### Categorical Series Aggregation

In [92]:
#  Aggregation methods for categorical data: .unique(), .nunique(), .value_counts()

In [93]:
my_series3

0    day 0
1    day 1
2    day 2
3    day 3
4    day 4
dtype: object

In [94]:
my_series3[3]='day 1'

In [95]:
my_series3

0    day 0
1    day 1
2    day 2
3    day 1
4    day 4
dtype: object

In [96]:
my_series3.value_counts()

day 1    2
day 2    1
day 0    1
day 4    1
dtype: int64

In [116]:
my_series3.value_counts(normalize=True)

day 1    0.4
day 4    0.2
day 0    0.2
day 2    0.2
dtype: float64

In [97]:
my_series3.nunique()

4

In [98]:
my_series3.unique()

array(['day 0', 'day 1', 'day 2', 'day 4'], dtype=object)

### Missing Data Representation in Pandas

In [99]:
sales=[0,5,155,np.nan,518]
sales_series=pd.Series(sales,name='sales',dtype='float16')
sales_series

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
Name: sales, dtype: float16

In [100]:
sales=[0,5,155,pd.NA,518]
sales_series2=pd.Series(sales,name='sales')
sales_series2

0       0
1       5
2     155
3    <NA>
4     518
Name: sales, dtype: object

#### A Series containing pd.NA can be converted to pandas' nullable Int64 dtype, but not to NumPy's int64, which cannot represent missing values.

In [102]:
sales_series2.astype('int64')   # lowercase 'int64' is the NumPy dtype; it cannot hold pd.NA, so this raises TypeError

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NAType'

In [103]:
sales_series2.astype('Int64')   # capital-I 'Int64' is the pandas dtype (distinct from NumPy 'int64'); it keeps the missing entry as <NA>

0       0
1       5
2     155
3    <NA>
4     518
Name: sales, dtype: Int64

### Identifying missing data

In [104]:
sales_series.isna()

0    False
1    False
2    False
3     True
4    False
Name: sales, dtype: bool

In [105]:
sales_series.isna().sum()

1

In [106]:
sales_series.value_counts()

518.0    1
155.0    1
5.0      1
0.0      1
Name: sales, dtype: int64

In [107]:
sales_series.value_counts(dropna=False)

518.0    1
NaN      1
155.0    1
5.0      1
0.0      1
Name: sales, dtype: int64

### Handling Missing data

In [108]:
# Methods for handling missing data: .dropna(), .fillna()

In [109]:
sales_series.dropna()

0      0.0
1      5.0
2    155.0
4    518.0
Name: sales, dtype: float16

In [110]:
sales_series.fillna(100)

0      0.0
1      5.0
2    155.0
3    100.0
4    518.0
Name: sales, dtype: float16

In [111]:
sales_series.fillna(sales_series.mean())

0      0.0
1      5.0
2    155.0
3    169.5
4    518.0
Name: sales, dtype: float16

In [112]:
sales_series

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
Name: sales, dtype: float16

In [113]:
sales_series.dropna()

0      0.0
1      5.0
2    155.0
4    518.0
Name: sales, dtype: float16

In [114]:
sales_series.dropna().reset_index(drop=True)

0      0.0
1      5.0
2    155.0
3    518.0
Name: sales, dtype: float16

### Applying custom functions to Series

In [115]:
sales_series

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
Name: sales, dtype: float16

In [116]:
def discount(price,threshold=150,rate=0.9):
    """Return ``price`` with a discount applied when it exceeds ``threshold``.

    Parameters
    ----------
    price : number
        The original price.
    threshold : number, default 150
        Only prices strictly greater than this value are discounted.
    rate : float, default 0.9
        Multiplier applied to a discounted price (0.9 means 10% off).

    Returns
    -------
    number
        ``round(price*rate, 2)`` when ``price > threshold``; otherwise
        ``price`` unchanged.
    """
    # NaN compares False against the threshold, so it is returned as-is.
    if price>threshold:
        return round(price*rate,2)
    return price

In [117]:
sales_series.apply(discount)

0      0.0
1      5.0
2    139.5
3      NaN
4    466.2
Name: sales, dtype: float64

In [118]:
sales_series.apply(lambda x:round(x*0.9,2) if x>150 else x)

0      0.0
1      5.0
2    139.5
3      NaN
4    466.2
Name: sales, dtype: float64

In [119]:
def search(string,looking_for):
    """Report whether ``looking_for`` occurs in ``string``.

    Returns 'Found It!' when ``looking_for in string`` is true and
    'Nope!' otherwise.
    """
    return 'Found It!' if looking_for in string else 'Nope!'

In [120]:
my_series3.str[-1]

0    0
1    1
2    2
3    1
4    4
dtype: object

In [150]:
# args must be a tuple of extra positional arguments passed to search() after
# each value. A bare string only works by accident when it has one character;
# a longer string would be unpacked into several arguments.
my_series3.str[-1].apply(search,args=('2',))

0        Nope!
1        Nope!
2    Found It!
3        Nope!
4        Nope!
dtype: object

In [121]:
my_series3.str[-1].apply(lambda x: 'Found It!' if '2' in x else 'Nope!')

0        Nope!
1        Nope!
2    Found It!
3        Nope!
4        Nope!
dtype: object

In [125]:
np.where(my_series3.str.contains('2'), "Found It!", "Nope!")

array(['Nope!', 'Nope!', 'Found It!', 'Nope!', 'Nope!'], dtype='<U9')

### Pandas where()  VS Numpy where()

In [126]:
sales_series

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
Name: sales, dtype: float16

In [127]:
sales_series.where(sales_series>10,sales_series*10)

0      0.0
1     50.0
2    155.0
3      NaN
4    518.0
Name: sales, dtype: float16

In [128]:
# Two chained where() calls, both conditions evaluated on the original series:
# 1) entries that are not > 10 are replaced by ten times their value;
# 2) entries that are not < 150 (NaN included) are replaced by 0.
(
    sales_series
    .where(sales_series>10,sales_series*10)
    .where(sales_series<150,0)
)

0     0.0
1    50.0
2     0.0
3     0.0
4     0.0
Name: sales, dtype: float16

In [129]:
string_series=my_series3.str[-1]
string_series

0    0
1    1
2    2
3    1
4    4
dtype: object

In [130]:
# First where(): keep entries containing '2', replace the rest with "Nope!".
# Second where(): replace the entries that do contain '2' with "Found It!".
(
    string_series
    .where(string_series.str.contains('2'),"Nope!")
    .where(~string_series.str.contains('2'),"Found It!")
)

0        Nope!
1        Nope!
2    Found It!
3        Nope!
4        Nope!
dtype: object

In [131]:
np.where(string_series.str.contains("2"),"Found It!","Nope!")

array(['Nope!', 'Nope!', 'Found It!', 'Nope!', 'Nope!'], dtype='<U9')

In [132]:
pd.Series(np.where(string_series.str.contains("2"),"Found It!","Nope!"))

0        Nope!
1        Nope!
2    Found It!
3        Nope!
4        Nope!
dtype: object