In [3]:
# Categorical data
import pandas as pd

# Requesting dtype="category" at construction time makes pandas infer the
# categories from the distinct values of the input.
s = pd.Series(list("abcac"), dtype="category")
s

0    a
1    b
2    c
3    a
4    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [9]:
df = pd.DataFrame({"A": ["a", "b", "c", "a", "c"]})

# Two ways to obtain a categorical column from the object column "A".
df["B"] = df["A"].astype("category")   # Series -> Series with category dtype
df["C"] = pd.Categorical(df["A"])      # builds a Categorical array, stored as a column

# Differences between the two approaches:
# * pd.Categorical(...) returns a Categorical array (not a Series) and takes
#   `categories=` / `ordered=` directly; with astype() the same control is
#   obtained by passing a pd.CategoricalDtype instead of the string "category".
# * In both cases the categories are inferred from the unique values of "A".
# * Missing values are handled identically: NaN stays a missing value and is
#   never turned into a category by either approach.

# Only the last expression of a cell is displayed, so show the dtypes here.
df.dtypes

A      object
B    category
C    category
dtype: object

In [13]:
# Creating large random datasets
import numpy as np

# A locally seeded Generator makes the sample reproducible on every run
# without modifying the global np.random state.
sampler = np.random.default_rng(0)
df = pd.DataFrame(sampler.choice(['foo', 'bar', 'baz'], size=(100000, 3)))

# DataFrame.astype converts column by column, so every column gets its own
# category dtype; no per-column apply/lambda is needed.
df = df.astype('category')

# Only the last expression of a cell is rendered, so the cell ends with the
# value that should be shown.
df.shape

(100000, 3)

In [15]:
# Correlation between columns
# Draw the sample from a locally seeded Generator so the correlation matrix is
# the same on every run; the global np.random state is left untouched.
sampler = np.random.default_rng(0)
df = pd.DataFrame(sampler.standard_normal((1000, 3)), columns=['a', 'b', 'c'])

# Pairwise Pearson correlation of the three independent standard-normal
# columns: the diagonal is 1 and the off-diagonal entries are close to 0.
df.corr()

Unnamed: 0,a,b,c
a,1.0,0.03378,0.01214
b,0.03378,1.0,-0.031378
c,0.01214,-0.031378,1.0


In [18]:
# A DataFrame is the 2-dimensional data structure provided by pandas (Series is
# its 1-dimensional counterpart; the former 3-dimensional Panel was removed in
# pandas 0.25). It can be compared to a table of rows and columns.
# Each row is identified by an integer position (0..N-1) or by a label set
# explicitly when the DataFrame is created. Each column can have its own dtype
# and is identified by a label.
df = pd.DataFrame({'numbers': [1, 2, 3], 'colors': ['red', 'white', 'blue']})

# With Python >= 3.6 and pandas >= 0.23 the columns follow the dict's insertion
# order (older versions sorted the keys alphabetically). Passing `columns=`
# states the order explicitly and keeps it independent of the dict.
df = pd.DataFrame(
    {'numbers': [1, 2, 3], 'colors': ['red', 'white', 'blue']},
    columns=['numbers', 'colors'],
)

df

Unnamed: 0,numbers,colors
0,1,red
1,2,white
2,3,blue


In [19]:
# Create a DataFrame of random numbers.
# Seeding the global NumPy RNG first makes the draws below reproducible.
np.random.seed(0)
df = pd.DataFrame(
    np.random.standard_normal(size=(5, 3)),
    columns=['A', 'B', 'C'],
)
print(df)

          A         B         C
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
2  0.950088 -0.151357 -0.103219
3  0.410599  0.144044  1.454274
4  0.761038  0.121675  0.443863


In [24]:
# Create a DataFrame with integers:
# the values 0..14 laid out row by row in 5 rows of 3 columns.
df = pd.DataFrame(
    np.arange(15).reshape(5, 3),
    columns=['A', 'B', 'C'],
)
print(df)

    A   B   C
0   0   1   2
1   3   4   5
2   6   7   8
3   9  10  11
4  12  13  14


In [26]:
# Create a DataFrame that will receive missing-value markers
# (NaT, NaN, 'nan', None) across columns and rows.
# Here only the base frame is built: the integers 0..47 in 8 rows x 6 columns.
df = pd.DataFrame(
    np.arange(48).reshape(8, 6),
    columns=['A', 'B', 'C', 'D', 'E', 'F'],
)
print(df)

    A   B   C   D   E   F
0   0   1   2   3   4   5
1   6   7   8   9  10  11
2  12  13  14  15  16  17
3  18  19  20  21  22  23
4  24  25  26  27  28  29
5  30  31  32  33  34  35
6  36  37  38  39  40  41
7  42  43  44  45  46  47


In [30]:
# Overwrite parts of `df` by position with different "missing" markers.
# Assumes `df` is the 8x6 integer frame (columns A..F) built in the preceding cell.
# NOTE(review): these assignments put NaN/NaT/str/None into integer columns; recent
# pandas releases warn about (and may reject) dtype-incompatible setitem — confirm
# against the installed pandas version.
df.iloc[::2,0] = np.nan # in column 0, set every second row (positions 0, 2, 4, ...) to NaN
df.iloc[::4,1] = pd.NaT # in column 1, set every fourth row (positions 0, 4, ...) to pd.NaT
df.iloc[:3,2] = 'nan' # in column 2, set rows at positions 0 to 2 (the stop 3 is exclusive) to the string 'nan'
df.iloc[:,5] = None # in column 5, set all elements to None
df.iloc[5,:] = None # in row 5, set all elements to None
df.iloc[7,:] = np.nan # in row 7, set all elements to NaN
df

Unnamed: 0,A,B,C,D,E,F
0,,NaT,,3.0,4.0,
1,6.0,7,,9.0,10.0,
2,,13,,15.0,16.0,
3,18.0,19,20.0,21.0,22.0,
4,,NaT,26.0,27.0,28.0,
5,,,,,,
6,,37,38.0,39.0,40.0,
7,,,,,,


In [32]:
# Create a sample DataFrame from multiple collections using a dictionary:
# each key becomes a column label, each collection the column's values.
np.random.seed(123)  # fixed seed, so the four draws are the same on every run
x = np.random.randn(4)
y = range(4)
df = pd.DataFrame(dict(X=x, Y=y))
df

Unnamed: 0,X,Y
0,-1.085631,0
1,0.997345,1
2,0.282978,2
3,-1.506295,3


In [33]:
# Create a DataFrame from a list of tuples.
# Each tuple becomes one row; since no column labels are given, the columns
# are numbered 0..3.
data = [
    ('p1', 't1', 1, 2),
    ('p1', 't2', 3, 4),
    ('p2', 't1', 5, 6),
    ('p2', 't2', 7, 8),
    ('p2', 't3', 2, 8),
]
df = pd.DataFrame(data)
print(df)


    0   1  2  3
0  p1  t1  1  2
1  p1  t2  3  4
2  p2  t1  5  6
3  p2  t2  7  8
4  p2  t3  2  8


In [34]:
# Create a DataFrame from a dictionary of lists.
# All lists must have the same length; otherwise an error is raised.
df = pd.DataFrame(dict(A=[1, 2, 3, 4], B=[4, 3, 2, 1]))
df

Unnamed: 0,A,B
0,1,4
1,2,3
2,3,2
3,4,1


In [35]:
# Using ndarray
# `x` is a NumPy ndarray of four standard-normal draws (seeded, so they are
# reproducible); `y` is a plain range. Both are accepted as column values.
np.random.seed(123)
x, y = np.random.standard_normal(size=4), range(4)
df = pd.DataFrame(data={'X': x, 'Y': y})
df


Unnamed: 0,X,Y
0,-1.085631,0
1,0.997345,1
2,0.282978,2
3,-1.506295,3


In [37]:
# Create a sample DataFrame with datetime
np.random.seed(0)  # reproducible values for the 'Val' column
# Create an array of 5 timestamps starting at '2015-02-24', one per minute.
# 'min' is the minutely alias; the single-letter 'T' spelling of the same
# frequency is deprecated in recent pandas releases.
rng = pd.date_range('2015-02-24', periods=5, freq='min')
df = pd.DataFrame({'Date': rng, 'Val': np.random.randn(len(rng))})
df

Unnamed: 0,Date,Val
0,2015-02-24 00:00:00,1.764052
1,2015-02-24 00:01:00,0.400157
2,2015-02-24 00:02:00,0.978738
3,2015-02-24 00:03:00,2.240893
4,2015-02-24 00:04:00,1.867558


In [39]:
# Build five consecutive calendar days beginning on '2015-02-24'
# and pair each date with one standard-normal draw.
rng = pd.date_range(start='2015-02-24', periods=5, freq='D')
df = pd.DataFrame(dict(Date=rng, Val=np.random.randn(len(rng))))
df

Unnamed: 0,Date,Val
0,2015-02-24,-0.977278
1,2015-02-25,0.950088
2,2015-02-26,-0.151357
3,2015-02-27,-0.103219
4,2015-02-28,0.410599


In [None]:
Alias Description
B business day frequency
C custom business day frequency (experimental)
D calendar day frequency
W weekly frequency
M month end frequency
BM business month end frequency
CBM custom business month end frequency
MS month start frequency
BMS business month start frequency
CBMS custom business month start frequency
Q quarter end frequency
BQ business quarter end frequency
QS quarter start frequency
BQS business quarter start frequency
A year end frequency
BA business year end frequency
AS year start frequency
BAS business year start frequency
BH business hour frequency
H hourly frequency
T, min minutely frequency
S secondly frequency
L, ms milliseconds
U, us microseconds
N nanoseconds