In [1]:
# Exercise from Pandas 10 mins: https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

#import the following libs
import numpy as np
import pandas as pd

# Load the nb_black extension so code cells are auto-formatted with Black when they run
%load_ext nb_black

In [2]:
# Build a Series from a plain list; pandas supplies the default integer index
# (0..5), and the NaN entry makes the resulting dtype float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)

In [3]:
# Echo the Series to display its values, integer index and dtype.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# First step towards a DataFrame built from a NumPy array: a DatetimeIndex of
# six consecutive days starting 2013-01-01, used later as the row index.
dates = pd.date_range(start="20130101", periods=6, freq="D")

In [5]:
# Echo the DatetimeIndex to confirm its six daily entries and frequency.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Build a 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# A locally seeded Generator makes the values identical on every
# Restart & Run All; the unseeded np.random.randn call produced new numbers
# on each run, so recorded outputs could never be reproduced.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))

In [7]:
# Echo the frame: six dated rows by the four columns A-D.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.009955,-1.713511,1.166578,-0.780483
2013-01-02,0.210069,-0.125465,1.206358,0.672644
2013-01-03,0.153311,0.00753,-0.084116,-2.869331
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-05,0.084986,3.129352,1.906084,-3.127187
2013-01-06,-0.840613,-0.594418,-0.757077,0.152805


In [10]:
# Build a DataFrame from a mapping of column name -> column spec. The scalar
# entries (A, B, F) are repeated for every row of the four-element
# array-like columns (C, D, E).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

<IPython.core.display.Javascript object>

In [11]:
# Echo df2 to see the scalar columns repeated across all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


<IPython.core.display.Javascript object>

In [12]:
# Each column of df2 keeps its own dtype, so .dtypes reports one entry per column.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

<IPython.core.display.Javascript object>

In [16]:
# Columns A, B, C, and D are offered by tab completion. E and F are there as well; the rest of the attributes have been truncated for brevity.
# Typing `df2.` and pressing <TAB> produces output like the following:

### df2.<TAB>

# df2.A                  df2.bool
# df2.abs                df2.boxplot
# df2.add                df2.C
# df2.add_prefix         df2.clip
# df2.add_suffix         df2.columns
# df2.align              df2.copy
# df2.all                df2.count
# df2.any                df2.combine
# df2.append             df2.D
# df2.apply              df2.describe
# df2.applymap           df2.diff
# df2.B                  df2.duplicated

<IPython.core.display.Javascript object>

In [19]:
# Example: column A accessed as an attribute, which displays it as a Series.
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

<IPython.core.display.Javascript object>

In [20]:
# Example: column B accessed as an attribute; its values are datetimes.
df2.B

0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: B, dtype: datetime64[ns]

<IPython.core.display.Javascript object>

<h3>Viewing data</h3>

In [24]:
# View the top rows of the frame; n=5 is head()'s default row count.
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,-0.009955,-1.713511,1.166578,-0.780483
2013-01-02,0.210069,-0.125465,1.206358,0.672644
2013-01-03,0.153311,0.00753,-0.084116,-2.869331
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-05,0.084986,3.129352,1.906084,-3.127187


<IPython.core.display.Javascript object>

In [23]:
# View the bottom three rows of the frame.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-05,0.084986,3.129352,1.906084,-3.127187
2013-01-06,-0.840613,-0.594418,-0.757077,0.152805


<IPython.core.display.Javascript object>

In [25]:
# Display the row index (the daily DatetimeIndex the frame was built with):
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

<IPython.core.display.Javascript object>

In [26]:
# Display the column labels:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

<IPython.core.display.Javascript object>

In [27]:
# Show the whole df table (all six rows, columns A-D).
# The cell previously held only a comment, so it displayed nothing when run.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.009955,-1.713511,1.166578,-0.780483
2013-01-02,0.210069,-0.125465,1.206358,0.672644
2013-01-03,0.153311,0.00753,-0.084116,-2.869331
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-05,0.084986,3.129352,1.906084,-3.127187
2013-01-06,-0.840613,-0.594418,-0.757077,0.152805


<IPython.core.display.Javascript object>

In [28]:
# DataFrame.to_numpy() gives a NumPy representation of the underlying data.
# Note that this can be an expensive operation when your DataFrame has columns
# with different data types, which comes down to a fundamental difference
# between pandas and NumPy: NumPy arrays have one dtype for the entire array,
# while pandas DataFrames have one dtype per column. When you call
# DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of
# the dtypes in the DataFrame. This may end up being object, which requires
# casting every value to a Python object.
#
# Note: DataFrame.to_numpy() does not include the index or column labels in
# the output.

df.to_numpy()


array([[-0.00995544, -1.71351059,  1.16657815, -0.78048267],
       [ 0.2100688 , -0.12546509,  1.20635775,  0.67264383],
       [ 0.15331119,  0.00753032, -0.08411576, -2.86933145],
       [-0.26078884,  0.59697412,  0.42084747, -1.1471715 ],
       [ 0.08498607,  3.12935212,  1.90608406, -3.12718652],
       [-0.84061323, -0.5944175 , -0.75707655,  0.15280526]])

<IPython.core.display.Javascript object>

In [29]:
# describe() shows a quick statistical summary of each column
# (count, mean, std, min, quartiles and max):
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.110499,0.216744,0.643113,-1.183121
std,0.39404,1.623121,0.972127,1.55014
min,-0.840613,-1.713511,-0.757077,-3.127187
25%,-0.19808,-0.477179,0.042125,-2.438791
50%,0.037515,-0.058967,0.793713,-0.963827
75%,0.13623,0.449613,1.196413,-0.080517
max,0.210069,3.129352,1.906084,0.672644


<IPython.core.display.Javascript object>

In [30]:
# Transpose the frame so the columns A-D become rows and the dates become columns.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.009955,0.210069,0.153311,-0.260789,0.084986,-0.840613
B,-1.713511,-0.125465,0.00753,0.596974,3.129352,-0.594418
C,1.166578,1.206358,-0.084116,0.420847,1.906084,-0.757077
D,-0.780483,0.672644,-2.869331,-1.147171,-3.127187,0.152805


<IPython.core.display.Javascript object>

In [31]:
# Sort along the column axis by label, in descending order (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.780483,1.166578,-1.713511,-0.009955
2013-01-02,0.672644,1.206358,-0.125465,0.210069
2013-01-03,-2.869331,-0.084116,0.00753,0.153311
2013-01-04,-1.147171,0.420847,0.596974,-0.260789
2013-01-05,-3.127187,1.906084,3.129352,0.084986
2013-01-06,0.152805,-0.757077,-0.594418,-0.840613


<IPython.core.display.Javascript object>

In [32]:
# Order the rows by the values in column B, smallest first.
df.sort_values("B", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,-0.009955,-1.713511,1.166578,-0.780483
2013-01-06,-0.840613,-0.594418,-0.757077,0.152805
2013-01-02,0.210069,-0.125465,1.206358,0.672644
2013-01-03,0.153311,0.00753,-0.084116,-2.869331
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-05,0.084986,3.129352,1.906084,-3.127187


<IPython.core.display.Javascript object>

In [34]:
# Order the rows by the values in column A, smallest first.
df.sort_values("A", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-06,-0.840613,-0.594418,-0.757077,0.152805
2013-01-04,-0.260789,0.596974,0.420847,-1.147171
2013-01-01,-0.009955,-1.713511,1.166578,-0.780483
2013-01-05,0.084986,3.129352,1.906084,-3.127187
2013-01-03,0.153311,0.00753,-0.084116,-2.869331
2013-01-02,0.210069,-0.125465,1.206358,0.672644


<IPython.core.display.Javascript object>