# Dataframes

In [1]:
# Import the libraries used throughout this notebook:
# NumPy supplies the random numeric data, pandas supplies the DataFrame.
import numpy as np
import pandas as pd 

# Runs only after both imports succeed, so a missing package is obvious.
print("Successfully imported numpy and pandas")

Successfully imported numpy and pandas


In [2]:
# Fix the state of NumPy's global random generator so the random numbers
# drawn in the cells below are the same on every fresh run of the notebook.
np.random.seed(seed=101)

In [3]:
# Preview the kind of data used below: a 5x4 NumPy array of draws from the
# standard normal distribution (a plain array, not yet a DataFrame).
# Note: this call advances the seeded global random state, so the DataFrame
# built in the next cell holds the *next* 20 draws, not the values shown here.
from numpy.random import randn
randn(5, 4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [4]:
# Build the example DataFrame: 5 rows labelled A-E and 4 columns labelled
# W-Z, filled with random numbers from the seeded NumPy generator.
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=list("ABCDE"),
    columns=list("WXYZ"),
)

In [5]:
# Display the freshly created DataFrame (row labels A-E, columns W-Z).
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


## Indexing columns of the dataframe

Dataframe columns are just Series.

In [6]:
# Bracket indexing with a single column label returns that column.
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [7]:
# Confirm that a single selected column is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [8]:
# The same bracket indexing works for any column label, here Z.
df['Z']

A   -1.159119
B    0.184502
C    0.329646
D    0.484752
E    1.996652
Name: Z, dtype: float64

In [9]:
# Column Z is a Series as well.
type(df['Z'])

pandas.core.series.Series

## Indexing Rows of the dataframe

Dataframe rows are also just Series.

In [10]:
# .loc selects by index label: this returns row A, indexed by column name.
df.loc["A"]

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [11]:
# Confirm that a single selected row is a pandas Series.
type(df.loc["A"])

pandas.core.series.Series

In [12]:
# Label-based row selection again, this time for row C.
df.loc["C"]

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [13]:
# Confirm that row C (selected in the previous cell) is also a Series.
type(df.loc["C"])

pandas.core.series.Series

## Selection of Multiple Columns or Rows of the Dataframe

In [14]:
# Selecting multiple columns from the dataframe df:
# passing a list of column labels returns a DataFrame, not a Series.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [15]:
# Selecting multiple rows from the dataframe df:
# passing a list of index labels to .loc returns a DataFrame of those rows.
df.loc[["A", "D"]]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
D,-0.497104,-0.75407,-0.943406,0.484752


In [16]:
# Select specific rows AND specific columns in a single .loc call
# (row labels first, then column labels). This avoids chained indexing
# (df.loc[rows][cols]), which builds an intermediate DataFrame and is
# unreliable if the same pattern is later used for assignment.
df.loc[["A", "C"], ["W", "Y"]]

Unnamed: 0,W,Y
A,0.302665,-1.706086
C,0.807706,0.638787


## Conditional Selection

In [17]:
# Selecting based on a condition: df["W"] < 0 is a boolean Series, and
# indexing with it keeps only the rows where the condition is True.
df[df["W"] < 0]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [18]:
# A boolean mask can also be passed to .loc; this selects the same rows
# as df[df["W"] < 0] in the previous cell.
df.loc[df["W"] < 0]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [19]:
# Repeats the previous cell's selection: rows where W is negative.
df.loc[df["W"] < 0]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [20]:
# Same technique with a different condition: rows where Y is positive.
df[df["Y"] > 0] 

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
E,-0.116773,1.901755,0.238127,1.996652


## Creating new variable in the data frame

In [21]:
# Create a new column V from the existing column Z (each Z value plus 20).
# Assigning to a label that does not exist yet adds it as a new column.
df["V"] = df["Z"] + 20

In [22]:
# Display df to confirm that the new column V was added.
df

Unnamed: 0,W,X,Y,Z,V
A,0.302665,1.693723,-1.706086,-1.159119,18.840881
B,-0.134841,0.390528,0.166905,0.184502,20.184502
C,0.807706,0.07296,0.638787,0.329646,20.329646
D,-0.497104,-0.75407,-0.943406,0.484752,20.484752
E,-0.116773,1.901755,0.238127,1.996652,21.996652


In [23]:
# Derive column U from two existing columns, element-wise: U = Z - Z * V.
df["U"] = df["Z"] - df["Z"] * df["V"]

# Show the frame with the new column appended.
df

Unnamed: 0,W,X,Y,Z,V,U
A,0.302665,1.693723,-1.706086,-1.159119,18.840881,20.679711
B,-0.134841,0.390528,0.166905,0.184502,20.184502,-3.539576
C,0.807706,0.07296,0.638787,0.329646,20.329646,-6.371946
D,-0.497104,-0.75407,-0.943406,0.484752,20.484752,-9.445265
E,-0.116773,1.901755,0.238127,1.996652,21.996652,-41.923014


## Dropping the columns from the data frame

In [24]:
# Drop one column. `columns=` names column labels directly (equivalent to
# passing the label with axis=1). The result is a new DataFrame; df itself
# is left unchanged.
df.drop(columns="U")

Unnamed: 0,W,X,Y,Z,V
A,0.302665,1.693723,-1.706086,-1.159119,18.840881
B,-0.134841,0.390528,0.166905,0.184502,20.184502
C,0.807706,0.07296,0.638787,0.329646,20.329646
D,-0.497104,-0.75407,-0.943406,0.484752,20.484752
E,-0.116773,1.901755,0.238127,1.996652,21.996652


In [25]:
# Drop several columns at once by passing a list of labels to `columns=`
# (equivalent to axis=1). This also returns a new DataFrame.
df.drop(columns=["V", "U"])

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [26]:
# df still contains V and U: the drop calls above returned new DataFrames
# and did not modify df.
df

Unnamed: 0,W,X,Y,Z,V,U
A,0.302665,1.693723,-1.706086,-1.159119,18.840881,20.679711
B,-0.134841,0.390528,0.166905,0.184502,20.184502,-3.539576
C,0.807706,0.07296,0.638787,0.329646,20.329646,-6.371946
D,-0.497104,-0.75407,-0.943406,0.484752,20.484752,-9.445265
E,-0.116773,1.901755,0.238127,1.996652,21.996652,-41.923014


To permanently drop some columns from the dataframe, use the argument "inplace = True"

In [27]:
# Remove V and U from df itself. With inplace=True the call modifies df and
# returns None, so running this cell a second time raises a KeyError
# because the columns are already gone.
df.drop(columns=["V", "U"], inplace=True)

In [28]:
# Display df to confirm that V and U were removed by the in-place drop.
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [29]:
# Drop rows by index label. `index=` targets row labels (the default axis
# for drop); the result is a new DataFrame and df is not modified.
df.drop(index=["A", "C"])

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652
