In [1]:
import pandas as pd
import numpy as np

In [2]:
s = pd.Series([1, 3, 4, np.nan, 5, 7])

In [3]:
print(s)

0    1.0
1    3.0
2    4.0
3    NaN
4    5.0
5    7.0
dtype: float64


In [4]:
sale_date = pd.date_range('20200501', periods=6)
print(sale_date)

DatetimeIndex(['2020-05-01', '2020-05-02', '2020-05-03', '2020-05-04',
               '2020-05-05', '2020-05-06'],
              dtype='datetime64[ns]', freq='D')


In [9]:
# Creating a DataFrame


df = pd.DataFrame((np.random.randn(6, 3)), index = sale_date, columns = list(["Mon", "Tue", "Wed"]))
print(df)

                 Mon       Tue       Wed
2020-05-01  0.761438  0.098537  0.771961
2020-05-02 -0.138873  0.110611  1.128360
2020-05-03  0.342659  0.066603  0.078950
2020-05-04  0.051416 -1.097302 -0.747096
2020-05-05  0.802672 -1.218945  1.317005
2020-05-06  1.096469 -0.031237 -0.629208


In [41]:
#Creating a DataFrame using a dictionary

df2 = pd.DataFrame({'A': 2.,
                    'B': pd.Timestamp("20200501"),
                    'C': pd.Series([1, 2, 3, 4], dtype='float32'),
                    'D': np.array([3] * 4, dtype ='int32'),
                    'E': pd.Categorical(["Intro", "Data", "Science", "Test"]),
                    'F': "foo"})

print(df2)

     A          B    C  D        E    F
0  2.0 2020-05-01  1.0  3    Intro  foo
1  2.0 2020-05-01  2.0  3     Data  foo
2  2.0 2020-05-01  3.0  3  Science  foo
3  2.0 2020-05-01  4.0  3     Test  foo


In [15]:
# passing a value in the parentheses slices the dataframe
print((df2.tail(2)))

     A          B    C  D        E    F
2  2.0 2020-05-01  3.0  3  Science  foo
3  2.0 2020-05-01  4.0  3     Test  foo


In [17]:
# df.head() first 5 by default
print(df2.head())
print(df2.head(2)) # first 2 example

     A          B    C  D        E    F
0  2.0 2020-05-01  1.0  3    Intro  foo
1  2.0 2020-05-01  2.0  3     Data  foo
2  2.0 2020-05-01  3.0  3  Science  foo
3  2.0 2020-05-01  4.0  3     Test  foo
     A          B    C  D      E    F
0  2.0 2020-05-01  1.0  3  Intro  foo
1  2.0 2020-05-01  2.0  3   Data  foo


In [19]:
# view the columns/ indexes
print(df2.columns)

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


In [20]:
print(df.index)

DatetimeIndex(['2020-05-01', '2020-05-02', '2020-05-03', '2020-05-04',
               '2020-05-05', '2020-05-06'],
              dtype='datetime64[ns]', freq='D')


## Dataframe to Numpy Array


In [26]:
# Converting our first dataframe as it is all floating values it doesn't cost muchh space compared to df2 as it requires data to be copied.
print(df.to_numpy())

[[ 0.76143756  0.09853708  0.77196121]
 [-0.13887318  0.11061148  1.12835972]
 [ 0.34265883  0.06660349  0.07895026]
 [ 0.05141625 -1.09730232 -0.74709609]
 [ 0.80267245 -1.21894547  1.31700511]
 [ 1.09646945 -0.031237   -0.62920759]]


In [22]:
print(df2.to_numpy())

[[2.0 Timestamp('2020-05-01 00:00:00') 1.0 3 'Intro' 'foo']
 [2.0 Timestamp('2020-05-01 00:00:00') 2.0 3 'Data' 'foo']
 [2.0 Timestamp('2020-05-01 00:00:00') 3.0 3 'Science' 'foo']
 [2.0 Timestamp('2020-05-01 00:00:00') 4.0 3 'Test' 'foo']]


## Dataframe Information

In [27]:
# shows all the info about the dataframe

# No. of entries
# Amount of Columns
# Count of values per column
# Dtype of each column


print(df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-05-01 to 2020-05-06
Freq: D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     6 non-null      float64
 1   Tue     6 non-null      float64
 2   Wed     6 non-null      float64
dtypes: float64(3)
memory usage: 192.0 bytes
None


In [25]:
# shows a quick statistic summary of the dataframe

print(df.describe())

            Mon       Tue       Wed
count  6.000000  6.000000  6.000000
mean   0.485964 -0.345289  0.319995
std    0.479311  0.632758  0.888766
min   -0.138873 -1.218945 -0.747096
25%    0.124227 -0.830786 -0.452168
50%    0.552048  0.017683  0.425456
75%    0.792364  0.090554  1.039260
max    1.096469  0.110611  1.317005


## Transforming Data 

In [30]:
# transpose Dataframe
print(df.T)

     2020-05-01  2020-05-02  2020-05-03  2020-05-04  2020-05-05  2020-05-06
Mon    0.761438   -0.138873    0.342659    0.051416    0.802672    1.096469
Tue    0.098537    0.110611    0.066603   -1.097302   -1.218945   -0.031237
Wed    0.771961    1.128360    0.078950   -0.747096    1.317005   -0.629208


In [33]:
# Sort by an Axis

# Axis:                  Ascending:
# 1 = Column              False = Descend
# 2 = Row                 True = Ascend


print(df.sort_index(axis=1, ascending= False))

                 Wed       Tue       Mon
2020-05-01  0.771961  0.098537  0.761438
2020-05-02  1.128360  0.110611 -0.138873
2020-05-03  0.078950  0.066603  0.342659
2020-05-04 -0.747096 -1.097302  0.051416
2020-05-05  1.317005 -1.218945  0.802672
2020-05-06 -0.629208 -0.031237  1.096469


In [40]:
# Sort by a Value

# you can select any column to sort by.
# you can also use ascending=.... to sort.S

# print(df.sort_values(by='C'))

## Selecting Data

In [48]:
# Slicing a Dataframe by index

# Slicing using [] selects rows. In this example [0:2] selects rows 1 to 2.

# Remember that in Python indexing starts from 0 not 1.

print(df[:])

                 Mon       Tue       Wed
2020-05-01  0.761438  0.098537  0.771961
2020-05-02 -0.138873  0.110611  1.128360
2020-05-03  0.342659  0.066603  0.078950
2020-05-04  0.051416 -1.097302 -0.747096
2020-05-05  0.802672 -1.218945  1.317005
2020-05-06  1.096469 -0.031237 -0.629208


In [50]:
# Slicing a Dataframe by index cont.

# You can also slice a dataframe by the row freatures. In this example we slice the rows by the dates in the index column.
print(df[:])

                 Mon       Tue       Wed
2020-05-01  0.761438  0.098537  0.771961
2020-05-02 -0.138873  0.110611  1.128360
2020-05-03  0.342659  0.066603  0.078950
2020-05-04  0.051416 -1.097302 -0.747096
2020-05-05  0.802672 -1.218945  1.317005
2020-05-06  1.096469 -0.031237 -0.629208
