In [1]:
import numpy as np
import pandas as pd

## Attributes of pandas objects

In [3]:
# 8 rows x 3 columns of standard-normal samples, rows labelled 0-7, columns A/B/C.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=range(8),
    columns=list('ABC'),
)

In [4]:
# Lower-case every column label using the vectorised string accessor.
df.columns = df.columns.str.lower()

In [5]:
df

Unnamed: 0,a,b,c
0,-1.210368,-0.112408,-2.313287
1,-0.629016,0.172335,-0.612788
2,1.215655,-1.95758,0.59996
3,-0.360736,0.361947,-0.370015
4,-0.930611,0.101266,0.859648
5,2.436476,-0.869312,-0.445892
6,0.077938,-0.343286,0.392544
7,1.73073,2.130597,-1.793521


In [6]:
# .array exposes the ExtensionArray that backs column 'a' (displayed as a NumpyExtensionArray).
df['a'].array

<NumpyExtensionArray>
[-1.2103682254647654, -0.6290158320488817,   1.215654636870989,
 -0.3607360824317242, -0.9306113786594737,   2.436475582329587,
 0.07793818370657812,  1.7307300157902392]
Length: 8, dtype: float64

# Counting values in a Series

In [7]:
# 50 random integers in the range 0..6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)

In [8]:
data

array([2, 5, 5, 2, 1, 1, 2, 6, 3, 6, 0, 6, 1, 1, 0, 4, 6, 4, 2, 4, 1, 0,
       1, 1, 2, 1, 3, 3, 0, 4, 4, 5, 6, 4, 5, 3, 4, 6, 3, 3, 0, 3, 6, 6,
       3, 4, 5, 5, 4, 1])

In [9]:
# Wrap the random integers in a Series so the pandas counting methods can be used on them.
s = pd.Series(data)

In [11]:
# Count how often each distinct value occurs; the result is ordered by descending count.
s.value_counts()

1    9
4    9
6    8
3    8
5    6
2    5
0    5
Name: count, dtype: int64

In [12]:
# Sample in which 3 and 7 are tied for the most frequent value.
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [13]:
# mode() returns a Series rather than a scalar because ties are possible: here both 3 and 7 are returned.
s5.mode()

0    3
1    7
dtype: int64

# .dt and .str accessors

In [14]:
# Datetime example: four consecutive daily timestamps starting 2013-01-01 09:10:12.
s = pd.Series(
    pd.date_range(start='20130101 09:10:12', periods=4)
)

In [15]:
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [16]:
s.dt.hour # access hour

0    9
1    9
2    9
3    9
dtype: int32

In [19]:
s.dt.second # seconds

0    12
1    12
2    12
3    12
dtype: int32

In [20]:
s.dt.day # day

0    1
1    2
2    3
3    4
dtype: int32

In [21]:
# Day of the week as an integer, Monday=0 through Sunday=6 (2013-01-01 was a Tuesday, hence 1).
s.dt.dayofweek

0    1
1    2
2    3
3    4
dtype: int32

In [22]:
# Nullable-string Series for the .str examples; the np.nan entry is stored as <NA>.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype="string",
)

In [23]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

# Sorting

In [24]:
# Three columns whose labels only partly overlap: the frame is aligned on the
# union of the labels (a-d) and positions missing from a column become NaN.
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=list('abc')),
    'two': pd.Series(np.random.randn(4), index=list('abcd')),
    'three': pd.Series(np.random.randn(3), index=list('bcd')),
})

In [26]:
# Scramble the row and column order so the sorting examples have something to reorder.
unsorted_df = df.reindex(
    index=list('adcb'),
    columns=['three', 'two', 'one'],
)

In [27]:
unsorted_df

Unnamed: 0,three,two,one
a,,-1.082326,-0.360146
d,0.53557,-1.686191,
c,2.128461,2.303066,-0.014249
b,-1.542814,-1.458757,1.89479


In [28]:
# Sort DataFrame by index
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,-1.082326,-0.360146
b,-1.542814,-1.458757,1.89479
c,2.128461,2.303066,-0.014249
d,0.53557,-1.686191,


In [29]:
# Sort DataFrame by index in descending order
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,0.53557,-1.686191,
c,2.128461,2.303066,-0.014249
b,-1.542814,-1.458757,1.89479
a,,-1.082326,-0.360146


In [30]:
# Sort DataFrame by column names (axis=1 reorders the column labels, rows are untouched)
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.360146,,-1.082326
d,,0.53557,-1.686191
c,-0.014249,2.128461,2.303066
b,1.89479,-1.542814,-1.458757


In [31]:
# Sort Series by index
unsorted_df['three'].sort_index()

a         NaN
b   -1.542814
c    2.128461
d    0.535570
Name: three, dtype: float64

### sort by values

In [32]:
# Small integer frame for the sort_values examples; column 'one' contains ties.
df1 = pd.DataFrame({
    'one': [2, 1, 1, 1],
    'two': [1, 3, 2, 4],
    'three': [5, 4, 3, 2],
})

In [33]:
# Sort DataFrame by column "two"
df1.sort_values(by='two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [34]:
# Sort DataFrame by column "one", breaking ties with column "two".
# df1 already consists of exactly the columns one/two/three in that order, so
# selecting them again before sorting would only create a redundant copy.
df1.sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [35]:
# Blank out the entry labelled 2 so the string Series has two missing values
# for the sorting examples. .loc makes the label-based assignment explicit
# (an integer key on plain [] is ambiguous between label and position), and
# pd.NA is the missing-value marker of the "string" dtype.
s.loc[2] = pd.NA

In [36]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [37]:
s.sort_values(na_position='first')

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

In [38]:
# Use the names of the index levels to sort by both an index level and a column.
# Build the MultiIndex
idx = pd.MultiIndex.from_tuples([
    ('a', 1), ('a', 2), ('a', 2),
    ('b', 2), ('b', 1), ('b', 1),
])

In [39]:
# Name the two index levels so they can be referred to by name when sorting.
idx.names = ['first', 'second']

In [40]:
# Build DataFrame: a single column A counting down from 6 to 1 over the MultiIndex rows.
df_multi = pd.DataFrame(
    data={'A': np.arange(6, 0, -1)},
    index=idx,
)

In [41]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [42]:
# 'second' is an index level name and 'A' is a column: `by` accepts both, so rows
# are ordered by the second index level first and by column A within each level value.
df_multi.sort_values(by=['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
