In [1]:
# pandas is a Python library for data manipulation and analysis.
# Its two key data structures are the Series and the DataFrame.

In [2]:
###Pandas dataframes###
# 2-dimensional labeled data structure as denoted by index and columns
#analogous to spreadsheet or SQL table

In [3]:
import pandas as pd
import numpy as np

In [14]:
# Nested list: each inner list will become one row of the DataFrame.
lst = [
    [1, 2, 3, 4],
    [10, 20, 30, 40],
]

In [15]:
# Build a frame straight from the nested list; no row or column labels are given.
df = pd.DataFrame(lst)

In [16]:
# With no labels supplied, rows and columns both get default integer labels starting at 0.
df

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,10,20,30,40


In [17]:
#row labels are stored in the index (df.index)
#column labels are stored in df.columns

In [28]:
#create dataframe with index and column names from nested list

#Name the rows using 'index' parameter
#Name the columns using the 'columns' parameter

In [38]:
# Two rows of three values each.
a = [
    [10, 20, 30],
    [40, 50, 60],
]

# `index` labels the rows, `columns` labels the columns.
adf = pd.DataFrame(
    a,
    index=['X', 'Y'],
    columns=['a', 'b', 'c'],
)

In [39]:
# Rows carry the labels X, Y and columns carry the labels a, b, c.
adf

Unnamed: 0,a,b,c
X,10,20,30
Y,40,50,60


In [40]:
#create dataframe using dictionary notation
# Each dict key becomes a column label, so X and Y are now columns and a, b, c are rows:
# the result is the transpose of the dataframe above

In [41]:
# Each list supplies the values of one column.
lst1 = [10, 20, 30]
lst2 = [40, 50, 60]

# Dict keys become the column labels; `index` labels the rows.
df = pd.DataFrame(
    dict(X=lst1, Y=lst2),
    index=list('abc'),
)

In [42]:
# Dict keys X and Y are the columns; a, b, c label the rows.
df

Unnamed: 0,X,Y
a,10,40
b,20,50
c,30,60


In [43]:
#create dataframe using dictionary notation using 2 dictionaries 

In [45]:
# Inner dicts: their keys become the row labels.
dct1 = dict(aa=10, bb=20, cc=30)
dct2 = dict(aa=100, bb=200, cc=300)

# Outer dict: its keys become the column labels.
df = pd.DataFrame(dict(X=dct1, Y=dct2))

In [46]:
# Outer dict keys (X, Y) are the columns; inner keys (aa, bb, cc) are the row labels.
df

Unnamed: 0,X,Y
aa,10,100
bb,20,200
cc,30,300


In [47]:
#create dataframe using dictionary notation using 2 series

In [48]:
# Two labelled Series sharing the same index labels aa, bb, cc.
ser1 = pd.Series({'aa': 10, 'bb': 20, 'cc': 30})
ser2 = pd.Series({'aa': 100, 'bb': 200, 'cc': 300})

In [49]:
# Each Series becomes one column; rows are aligned on the Series index labels.
df = pd.DataFrame({'X': ser1, 'Y': ser2})

In [50]:
# Series index labels are the rows; dict keys X and Y are the columns.
df

Unnamed: 0,X,Y
aa,10,100
bb,20,200
cc,30,300


In [51]:
#generate a dataframe from the Standard Normal distribution

In [60]:
# Seed NumPy's global RNG so the draws below are the same on every run.
np.random.seed(0)

# 3 x 5 draws from normal() with its default parameters (mean 0, std 1),
# labelled X..Z by row and aa..ee by column.
df = pd.DataFrame(
    np.random.normal(size=(3, 5)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)

In [61]:
# 3 rows (X, Y, Z) by 5 columns (aa..ee) of seeded random draws.
df

Unnamed: 0,aa,bb,cc,dd,ee
X,1.764052,0.400157,0.978738,2.240893,1.867558
Y,-0.977278,0.950088,-0.151357,-0.103219,0.410599
Z,0.144044,1.454274,0.761038,0.121675,0.443863


In [None]:
#Useful Pandas attributes for Dataframes

In [62]:
# Dimensionality as a (number of rows, number of columns) tuple
df.shape

(3, 5)

In [63]:
# Total number of elements (rows * columns)
df.size

15

In [64]:
# All row labels, returned as a pandas Index
df.index

Index(['X', 'Y', 'Z'], dtype='object')

In [65]:
# All column labels, returned as a pandas Index
df.columns

Index(['aa', 'bb', 'cc', 'dd', 'ee'], dtype='object')

In [66]:
#array data structure of all values in the dataframe

In [67]:
# Underlying data as a NumPy array (labels are dropped).
# pandas recommends to_numpy() over the older .values attribute.
df.to_numpy()

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799],
       [-0.97727788,  0.95008842, -0.15135721, -0.10321885,  0.4105985 ],
       [ 0.14404357,  1.45427351,  0.76103773,  0.12167502,  0.44386323]])

In [69]:
# First n rows of the frame (n=1 here)
df.head(n=1)

Unnamed: 0,aa,bb,cc,dd,ee
X,1.764052,0.400157,0.978738,2.240893,1.867558


In [71]:
# Summary of the frame: index range, each column's non-null count and dtype, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   aa      3 non-null      float64
 1   bb      3 non-null      float64
 2   cc      3 non-null      float64
 3   dd      3 non-null      float64
 4   ee      3 non-null      float64
dtypes: float64(5)
memory usage: 144.0+ bytes


In [72]:
# Descriptive statistics for each column: count, mean, std, min, quartiles (25%/50%/75%) and max
df.describe()

Unnamed: 0,aa,bb,cc,dd,ee
count,3.0,3.0,3.0,3.0,3.0
mean,0.310273,0.93484,0.529473,0.753116,0.90734
std,1.378204,0.527224,0.599579,1.29335,0.83174
min,-0.977278,0.400157,-0.151357,-0.103219,0.410599
25%,-0.416617,0.675123,0.30484,0.009228,0.427231
50%,0.144044,0.950088,0.761038,0.121675,0.443863
75%,0.954048,1.202181,0.869888,1.181284,1.155711
max,1.764052,1.454274,0.978738,2.240893,1.867558


In [74]:
# Transpose: swap rows and columns (.T is the accessor for transpose())
df.T

Unnamed: 0,X,Y,Z
aa,1.764052,-0.977278,0.144044
bb,0.400157,0.950088,1.454274
cc,0.978738,-0.151357,0.761038
dd,2.240893,-0.103219,0.121675
ee,1.867558,0.410599,0.443863
