In [None]:
# We use pandas for overall data manipulation in this session.
# Pandas is a very popular Python library for data analysis. We can effectively manipulate, filter and create data
# using Pandas.
# DataFrames are the backbone of Pandas.

In [2]:
# A two-dimensional (tabular) data structure expressed as a plain Python dictionary:
# each key names a column and each list holds that column's values, one entry per row.
# NOTE(review): "kecin@email.com" looks like a typo for "kevin@email.com" — confirm
# before changing it, since a later cell looks a row up by this exact value.
people = dict(
    name=["Jon", "Jane", "Ken", "Kevin"],
    email=["jon@email.com", "jane@email.com", "ken@email.com", "kecin@email.com"],
    address=["KTM", "BKT", "PKR", "LTP"],
)


In [4]:
import pandas as pd
# Build a DataFrame from the dictionary: the keys become the column labels and
# the rows get a default integer index (0..3 here).
df = pd.DataFrame(people)
df

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT
2,Ken,ken@email.com,PKR
3,Kevin,kecin@email.com,LTP


In [5]:
# tail() with no argument returns the last 5 rows; this frame has only 4, so every row is shown
df.tail()

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT
2,Ken,ken@email.com,PKR
3,Kevin,kecin@email.com,LTP


In [6]:
# tail(1) returns only the last row
df.tail(1)


Unnamed: 0,name,email,address
3,Kevin,kecin@email.com,LTP


In [7]:
# `shape` is an attribute holding a (rows, columns) tuple, not a method, so it is
# read without parentheses. Calling it as `df.shape()` raises
# "TypeError: 'tuple' object is not callable".
df.shape

(4, 3)

In [8]:
# head(2) returns the first two rows
df.head(2)

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT


In [9]:
# info() prints a summary: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   email    4 non-null      object
 2   address  4 non-null      object
dtypes: object(3)
memory usage: 228.0+ bytes


In [10]:
# shape is an attribute (no parentheses): a (rows, columns) tuple
df.shape

(4, 3)

In [11]:
# Other ways to inspect a DataFrame: df.shape, df.tail(), df.head()
# we can also access a column / Series using the dot (.) operator
df.email
# but using '.' is not recommended because it may conflict with the DataFrame attributes
# df[["name", "email"]]
# type(df[["name", "email"]]) is a DataFrame
# df.columns
# df.iloc[0] = it gives the first row of the dataframe
# iloc stands for integer-location.
# we can give rows and columns in the iloc
# if we want to provide multiple rows and multiple columns then they should be enclosed in [].

# df.iloc[[0,1]]
# df.iloc[0,1] = here 0 index is for row and 1 index is for column
# df.iloc[[0,1],1] = here [0,1] are rows and 1 is for column
# df.iloc[[0,1],[0,1]] # here [0,1] are rows and [0,1] are columns

0      jon@email.com
1     jane@email.com
2      ken@email.com
3    kecin@email.com
Name: email, dtype: object

In [12]:
# We also have a 'loc' method for a dataframe
# df.loc[[0,1], ["name", "email"]]
# df.loc[0:3, "name":"email"]



In [13]:
# we can also set the index of our rows
# df.set_index

# new_df = df.set_index('email')
# df.set_index('email', inplace=True)
# df.loc[["jon@email.com", "kecin@email.com"], ["name", "address"]]

# df.reset_index(inplace=True) = this removes the custom index and resets it to the default

In [14]:
# df.sort_index()
# df.sort_index(ascending=False)

In [None]:
# Filtering
# df["address"] == "PKR" # this only builds a boolean mask
# filt = df["address"] == "PKR"
# df[filt]   (the mask can also be written as filt = (df["address"] == "PKR"))
# df.loc[filt]

# df.loc[filt, ["address"]]
# df.loc[filt, ["address", "name"]]

# filt = (df["address"] == "PKR") | (df["name"] == "Kevin")
# df.loc[filt]
# df.loc[~filt] = ~ negates the mask, i.e. it selects the rows that filt excludes

In [16]:
# Attribute-style (dot) access returns the "email" column as a Series
df.email


0      jon@email.com
1     jane@email.com
2      ken@email.com
3    kecin@email.com
Name: email, dtype: object

In [18]:
type(df['email']) # Series
# A Series is a single column of a DataFrame
# A DataFrame is a collection of such Series

pandas.core.series.Series

In [19]:
# we can also access the column / Series using the dot (.) operator
df.email
# but using dot (.) is not recommended because it may conflict with the DataFrame attributes

0      jon@email.com
1     jane@email.com
2      ken@email.com
3    kecin@email.com
Name: email, dtype: object

In [20]:
# A list of column labels inside [] selects several columns at once; the result is a DataFrame
df[['name', 'email']]

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com
2,Ken,ken@email.com
3,Kevin,kecin@email.com


In [21]:
# columns holds the column labels as an Index object
df.columns

Index(['name', 'email', 'address'], dtype='object')

In [22]:
df.iloc[0] # it gives the first row of the dataframe (returned as a Series)
# iloc stands for integer-location
# We can give both rows and columns to iloc
# If we want to provide multiple rows or multiple columns, they should be enclosed in [].

name                 Jon
email      jon@email.com
address              KTM
Name: 0, dtype: object

In [23]:
df.iloc[0,1] # here index 0 is for the row and index 1 is for the column

'jon@email.com'

In [26]:
# row position 2, column position 2 -> a single scalar value
df.iloc[2,2]

'PKR'

In [28]:
# a list of row positions selects those rows (here positions 0 and 2) with all columns
df.iloc[[0,2]]

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
2,Ken,ken@email.com,PKR


In [29]:
df.iloc[[0,1], 2] # here [0,1] are the rows and 2 is the column

0    KTM
1    BKT
Name: address, dtype: object

In [30]:
# integer slices exclude the stop position: rows 0-2 and columns 0-1
df.iloc[0:3, 0:2]

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com
2,Ken,ken@email.com


In [32]:
# a list of row positions can be combined with a column slice (columns 1-2 here)
df.iloc[[0,1], 1:3]

Unnamed: 0,email,address
0,jon@email.com,KTM
1,jane@email.com,BKT


In [33]:
# We also have 'loc' for a dataframe; it selects by row/column labels rather than integer positions
df.loc[[0,1], ["name", "address"]]


Unnamed: 0,name,address
0,Jon,KTM
1,Jane,BKT


In [34]:
# label slices in loc include both endpoints: rows 1 through 3, columns "name" through "email"
df.loc[1:3, "name":"email"]

Unnamed: 0,name,email
1,Jane,jane@email.com
2,Ken,ken@email.com
3,Kevin,kecin@email.com


In [35]:
# we can also set the index for our rows
# set_index returns a new DataFrame here; df itself is left unchanged
df.set_index("email")

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kecin@email.com,Kevin,LTP


In [36]:
# df still has the default integer index because the set_index result above was not assigned
df


Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT
2,Ken,ken@email.com,PKR
3,Kevin,kecin@email.com,LTP


In [37]:
# inplace=True modifies df itself (and returns None), so this cell displays no output
df.set_index("email", inplace= True)

In [38]:
# "email" is now the row index and no longer a regular column
df


Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kecin@email.com,Kevin,LTP


In [39]:
# with "email" as the index, loc selects rows by their email labels
df.loc[["jon@email.com", "kecin@email.com"], ["name", "address"]]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
kecin@email.com,Kevin,LTP


In [40]:
# iloc still works with integer positions; columns 0 and 1 are now "name" and "address"
df.iloc[[0,1], [0,1]]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT


In [41]:
df.reset_index(inplace=True) # This removes the custom index and resets it to the default; "email" becomes a column again

In [42]:
# back to the default integer index; "email" is now the first column
df

Unnamed: 0,email,name,address
0,jon@email.com,Jon,KTM
1,jane@email.com,Jane,BKT
2,ken@email.com,Ken,PKR
3,kecin@email.com,Kevin,LTP


In [43]:
# sort the rows by index label (ascending by default)
df.sort_index()


Unnamed: 0,email,name,address
0,jon@email.com,Jon,KTM
1,jane@email.com,Jane,BKT
2,ken@email.com,Ken,PKR
3,kecin@email.com,Kevin,LTP


In [45]:
# sort the rows by index label in descending order
df.sort_index(ascending = False)

Unnamed: 0,email,name,address
3,kecin@email.com,Kevin,LTP
2,ken@email.com,Ken,PKR
1,jane@email.com,Jane,BKT
0,jon@email.com,Jon,KTM


In [47]:
# Filtering
# The comparison returns a boolean Series (a mask) that is True where the address is "BKT"
df["address"] == "BKT"

0    False
1     True
2    False
3    False
Name: address, dtype: bool

In [48]:
# Store the mask, then index the frame with it to keep only the rows where the mask is True
filt = df["address"] == "BKT"
df[filt]

Unnamed: 0,email,name,address
1,jane@email.com,Jane,BKT


In [49]:
# loc accepts the same boolean mask
df.loc[filt]

Unnamed: 0,email,name,address
1,jane@email.com,Jane,BKT


In [50]:
# mask for the rows plus a column label -> the matching "email" values as a Series
df.loc[filt, "email"]

1    jane@email.com
Name: email, dtype: object

In [51]:
# the full frame again; filtering did not modify df
df


Unnamed: 0,email,name,address
0,jon@email.com,Jon,KTM
1,jane@email.com,Jane,BKT
2,ken@email.com,Ken,PKR
3,kecin@email.com,Kevin,LTP


In [53]:
# Combine conditions with | (element-wise OR); each condition is wrapped in its own parentheses
filt = (df["address"] == "PKR") | (df["email"] == "jon@email.com")
df.loc[filt]

Unnamed: 0,email,name,address
0,jon@email.com,Jon,KTM
2,ken@email.com,Ken,PKR


In [54]:
# ~ negates the mask: the rows that match neither condition
df.loc[~filt]

Unnamed: 0,email,name,address
1,jane@email.com,Jane,BKT
3,kecin@email.com,Kevin,LTP
