In [2]:
import numpy as np
import pandas as pd


In [3]:
# A Series is pandas' one-dimensional labeled array. No index is given here,
# so the default integer labels 0..5 are used; np.nan marks a missing value.
s = pd.Series(data=[1, 2, 3, np.nan, 6, 8])
s


0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# DataFrames are 2-D tables with labeled rows and columns. First build the row
# labels: a DatetimeIndex of six consecutive days starting on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates



DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Build a 6x4 DataFrame of samples from np.random.randn, using `dates` as the
# row index and A, B, C, D as the column labels.
# NOTE(review): no random seed is set in the visible cells, so these values
# (and every output below that derives from df) will differ on each fresh run.
# Consider seeding the generator if reproducible output matters.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))

df


Unnamed: 0,A,B,C,D
2013-01-01,-0.526508,0.630103,-0.321195,-0.1478
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,-0.700436,0.543318,0.38929
2013-01-04,1.430661,0.740514,0.235313,-0.988487
2013-01-05,0.747896,1.522282,-0.18526,-0.553524
2013-01-06,0.29211,1.046643,0.079365,0.231245


In [9]:
# Build a DataFrame from a dict of column-like objects. The scalar entries
# ("A", "B", "F") are repeated for every row, and each column keeps its own
# dtype (float, datetime, float32, int32, category, object).
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2



Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
# The columns of a DataFrame can each have a different dtype.
df2.dtypes


A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data


In [11]:
# head() with no argument returns the first 5 rows of the DataFrame
# (pass a number, e.g. df.head(10), to change how many rows are returned).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.526508,0.630103,-0.321195,-0.1478
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,-0.700436,0.543318,0.38929
2013-01-04,1.430661,0.740514,0.235313,-0.988487
2013-01-05,0.747896,1.522282,-0.18526,-0.553524


In [12]:
# Returns the row labels (the index) of the DataFrame.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Returns the column labels of the DataFrame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
df.to_numpy() # Convert the DataFrame's values to a NumPy array (index and column labels are not included)

array([[-0.52650844,  0.63010279, -0.3211952 , -0.14779965],
       [ 1.28452109,  1.31686012,  0.8122058 ,  1.472196  ],
       [ 0.67319092, -0.70043583,  0.54331785,  0.38928977],
       [ 1.43066111,  0.74051434,  0.23531281, -0.98848652],
       [ 0.74789621,  1.52228203, -0.1852598 , -0.55352449],
       [ 0.29211013,  1.04664315,  0.07936474,  0.23124538]])

In [16]:
# describe() shows a quick statistical summary of each column:
# count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.650312,0.759328,0.193958,0.067153
std,0.711959,0.790374,0.430886,0.854478
min,-0.526508,-0.700436,-0.321195,-0.988487
25%,0.38738,0.657706,-0.119104,-0.452093
50%,0.710544,0.893579,0.157339,0.041723
75%,1.150365,1.249306,0.466317,0.349779
max,1.430661,1.522282,0.812206,1.472196


In [17]:
# .T transposes the DataFrame: the column labels become the index and the
# dates become the columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.526508,1.284521,0.673191,1.430661,0.747896,0.29211
B,0.630103,1.31686,-0.700436,0.740514,1.522282,1.046643
C,-0.321195,0.812206,0.543318,0.235313,-0.18526,0.079365
D,-0.1478,1.472196,0.38929,-0.988487,-0.553524,0.231245


In [18]:
# sort_index sorts by the labels along an axis. Here axis=1 sorts the column
# labels and ascending=False reverses them, giving D, C, B, A.
df.sort_index(axis=1, ascending=False)


Unnamed: 0,D,C,B,A
2013-01-01,-0.1478,-0.321195,0.630103,-0.526508
2013-01-02,1.472196,0.812206,1.31686,1.284521
2013-01-03,0.38929,0.543318,-0.700436,0.673191
2013-01-04,-0.988487,0.235313,0.740514,1.430661
2013-01-05,-0.553524,-0.18526,1.522282,0.747896
2013-01-06,0.231245,0.079365,1.046643,0.29211


In [19]:
# sort_values sorts the rows by the values in the given column
# (here column B, smallest first).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.673191,-0.700436,0.543318,0.38929
2013-01-01,-0.526508,0.630103,-0.321195,-0.1478
2013-01-04,1.430661,0.740514,0.235313,-0.988487
2013-01-06,0.29211,1.046643,0.079365,0.231245
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-05,0.747896,1.522282,-0.18526,-0.553524


# Selection


In [21]:
# Selecting a single column with [] (getitem) returns that column as a Series.
df["A"]



2013-01-01   -0.526508
2013-01-02    1.284521
2013-01-03    0.673191
2013-01-04    1.430661
2013-01-05    0.747896
2013-01-06    0.292110
Freq: D, Name: A, dtype: float64

In [23]:
# Slicing with [] selects rows by position: 0:3 returns rows 0, 1 and 2
# (the end of the slice is exclusive).

df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.526508,0.630103,-0.321195,-0.1478
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,-0.700436,0.543318,0.38929


In [24]:
# Selection by label: .loc with a single index label returns that row as a
# Series whose index is the column labels.
df.loc[dates[0]]

A   -0.526508
B    0.630103
C   -0.321195
D   -0.147800
Name: 2013-01-01 00:00:00, dtype: float64

In [25]:
# Select all rows (:) and only the columns labeled A and B.
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,-0.526508,0.630103
2013-01-02,1.284521,1.31686
2013-01-03,0.673191,-0.700436
2013-01-04,1.430661,0.740514
2013-01-05,0.747896,1.522282
2013-01-06,0.29211,1.046643


In [None]:
# Selecting a single row label and a single column label returns a scalar.
df.loc[dates[0],"A"]

np.float64(-0.5265084406256869)

In [27]:
# .at is the fast accessor for a single scalar by row/column label; it returns
# the same value as the equivalent .loc lookup.
df.at[dates[0],"A"]

np.float64(-0.5265084406256869)

In [28]:
# Selection by position: .iloc[3] returns the row at integer position 3
# (the fourth row) as a Series.
df.iloc[3]

A    1.430661
B    0.740514
C    0.235313
D   -0.988487
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
# Integer slices work like Python/NumPy slices (end exclusive):
# rows at positions 3-4 and columns at positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.430661,0.740514
2013-01-05,0.747896,1.522282


In [30]:
# Lists of integer positions: rows 1, 2 and 4, columns 0 and 2.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.284521,0.812206
2013-01-03,0.673191,0.543318
2013-01-05,0.747896,-0.18526


In [31]:
# Slicing rows explicitly: rows at positions 1-2, all columns.
df.iloc[1:3,:]


Unnamed: 0,A,B,C,D
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,-0.700436,0.543318,0.38929


In [32]:
# Slicing columns explicitly: all rows, columns at positions 1-2.
df.iloc[:, 1:3]


Unnamed: 0,B,C
2013-01-01,0.630103,-0.321195
2013-01-02,1.31686,0.812206
2013-01-03,-0.700436,0.543318
2013-01-04,0.740514,0.235313
2013-01-05,1.522282,-0.18526
2013-01-06,1.046643,0.079365


In [33]:
# A single row position and a single column position return a scalar.
df.iloc[1, 1]

np.float64(1.3168601163500486)

In [34]:
# Boolean indexing: keep only the rows where column A is greater than 0.
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,-0.700436,0.543318,0.38929
2013-01-04,1.430661,0.740514,0.235313,-0.988487
2013-01-05,0.747896,1.522282,-0.18526,-0.553524
2013-01-06,0.29211,1.046643,0.079365,0.231245


In [35]:
# Indexing with a boolean DataFrame keeps the values where the condition holds
# and leaves the other cells missing (NaN); the shape is unchanged.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.630103,,
2013-01-02,1.284521,1.31686,0.812206,1.472196
2013-01-03,0.673191,,0.543318,0.38929
2013-01-04,1.430661,0.740514,0.235313,
2013-01-05,0.747896,1.522282,,
2013-01-06,0.29211,1.046643,0.079365,0.231245


In [36]:
# Work on a copy so that df itself is not modified, then add a string column E.
# NOTE(review): this rebinds the name df2, replacing the mixed-dtype frame built
# earlier in the notebook; a distinct name would avoid the hidden-state risk.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.526508,0.630103,-0.321195,-0.1478,one
2013-01-02,1.284521,1.31686,0.812206,1.472196,one
2013-01-03,0.673191,-0.700436,0.543318,0.38929,two
2013-01-04,1.430661,0.740514,0.235313,-0.988487,three
2013-01-05,0.747896,1.522282,-0.18526,-0.553524,four
2013-01-06,0.29211,1.046643,0.079365,0.231245,three


In [37]:
# isin() builds a boolean mask; keep the rows whose E value is "two" or "four".
df2[df2["E"].isin(["two","four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.673191,-0.700436,0.543318,0.38929,two
2013-01-05,0.747896,1.522282,-0.18526,-0.553524,four


# Setting


In [38]:
# A Series holding the integers 1..6, labeled with six consecutive days
# starting on 2013-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)
s1


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [40]:
# Setting a value by label: .at writes the single cell at row dates[0],
# column A.
# NOTE: this modifies df in place, so outputs displayed by earlier cells no
# longer reflect its current contents.

df.at[dates[0],"A"] = 0


In [41]:
# Setting a value by position: .iat writes the single cell at row position 0,
# column position 1 (column B). Like the .at assignment, this modifies df in place.
df.iat[0,1] = 0

In [44]:
# Setting a whole column by assigning a NumPy array with one value per row.
# In the displayed result column D reads 5.0 in every row, and the earlier
# .at/.iat assignments show up as 0.0 in the first row of A and B.
df.loc[:, "D"] = np.array([5] * len(df))
df


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.321195,5.0
2013-01-02,1.284521,1.31686,0.812206,5.0
2013-01-03,0.673191,-0.700436,0.543318,5.0
2013-01-04,1.430661,0.740514,0.235313,5.0
2013-01-05,0.747896,1.522282,-0.18526,5.0
2013-01-06,0.29211,1.046643,0.079365,5.0
