# Create a DataFrame with 5 rows and 4 columns using np.random.randn, and perform various operations on it with the pandas library

In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Create a 5x4 DataFrame of standard-normal samples, with rows labelled
# 'a'..'e' and columns labelled 'one'..'four'.
# Seed NumPy's global RNG first so the same values are produced on every
# Restart & Run All; without a seed each run yields a different frame.
np.random.seed(0)
df = pd.DataFrame(
    np.random.randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
a,1.584676,-0.429047,-0.010274,0.923868
b,0.259563,1.220298,0.051023,1.613023
c,-0.545955,-0.721365,0.967211,-0.781962
d,-0.99535,-0.732352,-0.655113,0.608756
e,-1.022464,0.710392,-0.65731,0.798035


In [3]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null count and dtype per column, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     5 non-null      float64
 1   two     5 non-null      float64
 2   three   5 non-null      float64
 3   four    5 non-null      float64
dtypes: float64(4)
memory usage: 200.0+ bytes


In [4]:
# Summary statistics for every column:
# count, mean, standard deviation, minimum, maximum and the 25th/50th/75th percentiles
df.describe()

Unnamed: 0,one,two,three,four
count,5.0,5.0,5.0,5.0
mean,-0.143906,0.009585,-0.060893,0.632344
std,1.0965,0.899182,0.667251,0.876782
min,-1.022464,-0.732352,-0.65731,-0.781962
25%,-0.99535,-0.721365,-0.655113,0.608756
50%,-0.545955,-0.429047,-0.010274,0.798035
75%,0.259563,0.710392,0.051023,0.923868
max,1.584676,1.220298,0.967211,1.613023


In [5]:
# Pairwise correlation between columns (Pearson is pandas' default method).
# Each coefficient lies between -1 and 1; the diagonal is always 1.
df.corr()

Unnamed: 0,one,two,three,four
one,1.0,0.010185,0.256013,0.349163
two,0.010185,1.0,-0.256382,0.706143
three,0.256013,-0.256382,1.0,-0.604387
four,0.349163,0.706143,-0.604387,1.0


In [6]:
# Check for null values: returns a boolean frame of the same shape,
# True where a value is missing and False otherwise
df.isnull()

Unnamed: 0,one,two,three,four
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,False,False,False
e,False,False,False,False


In [7]:
# Number of null values in each column (sums the boolean mask column-wise)
df.isnull().sum()

one      0
two      0
three    0
four     0
dtype: int64

In [8]:
# Number of distinct values in each column
df.nunique()

one      5
two      5
three    5
four     5
dtype: int64

In [9]:
# Return the unique values of the given column as a NumPy array,
# in order of first appearance
df['one'].unique()

array([ 1.5846762 ,  0.25956268, -0.54595489, -0.99535033, -1.02246422])

In [10]:
# Count how many times each distinct value occurs in the column
df["one"].value_counts()

-0.545955    1
-1.022464    1
 1.584676    1
 0.259563    1
-0.995350    1
Name: one, dtype: int64

In [11]:
# Boolean-mask the whole frame: keep values greater than 1 and
# replace every other cell with NaN (the shape is unchanged)
df[df>1]

Unnamed: 0,one,two,three,four
a,1.584676,,,
b,,1.220298,,1.613023
c,,,,
d,,,,
e,,,,


In [12]:
# Shape of the DataFrame as a (rows, columns) tuple
df.shape

(5, 4)

In [13]:
# Return the first 2 rows
df.head(2)

Unnamed: 0,one,two,three,four
a,1.584676,-0.429047,-0.010274,0.923868
b,0.259563,1.220298,0.051023,1.613023


In [14]:
# Return the last 2 rows
df.tail(2)

Unnamed: 0,one,two,three,four
d,-0.99535,-0.732352,-0.655113,0.608756
e,-1.022464,0.710392,-0.65731,0.798035


In [15]:
# Select by integer position: rows 1-3 and columns 1-2
# (the end of each slice is exclusive)
df.iloc[1:4,1:3]

Unnamed: 0,two,three
b,1.220298,0.051023
c,-0.721365,0.967211
d,-0.732352,-0.655113


In [16]:
# Select the rows at integer positions 2 and 3 (the slice end is exclusive).
# .iloc makes the positional intent explicit; plain df[2:4] returns the same
# rows but reads like a label/column lookup on this string-indexed frame.
df.iloc[2:4]

Unnamed: 0,one,two,three,four
c,-0.545955,-0.721365,0.967211,-0.781962
d,-0.99535,-0.732352,-0.655113,0.608756


In [17]:
# Drop every column (axis=1) that contains at least one NaN value.
# df has no NaN values, so all four columns are kept here.
# The result is a new frame; df itself is not modified.
df.dropna(axis=1)

Unnamed: 0,one,two,three,four
a,1.584676,-0.429047,-0.010274,0.923868
b,0.259563,1.220298,0.051023,1.613023
c,-0.545955,-0.721365,0.967211,-0.781962
d,-0.99535,-0.732352,-0.655113,0.608756
e,-1.022464,0.710392,-0.65731,0.798035


In [18]:
# Select several columns at once by passing a list of column names
df[['two','three']]

Unnamed: 0,two,three
a,-0.429047,-0.010274
b,1.220298,0.051023
c,-0.721365,0.967211
d,-0.732352,-0.655113
e,0.710392,-0.65731
