<a href="https://colab.research.google.com/github/Adeelzafar/Adeelzafar/blob/main/Pandas_Crash_Course_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Introduction

In [None]:
# In this series of lectures, we will study:
# Why we use Pandas
# Pandas Series and Dataframe
# Missing Data

In [None]:
# The name "pandas" is derived from "panel data"; it is a powerful library for data handling
# Pandas is also widely used for feature engineering

In [None]:
# Let's jump into Pandas Series
# A Pandas Series is very similar to a NumPy array, except for the addition of a named index


# Let's Start Coding

In [None]:
#importing libraries
import numpy as np 
import pandas as pd

In [None]:
# Sample inputs used to build Series objects:
# a list of labels, a list of values, a dict and a NumPy array
labels = list('abc')
mylist = [10, 20, 20]
d = dict(a=20, b=30, c=40)
array = np.arange(10, 15, 2)  # start=10, stop=15 (exclusive), step=2

In [None]:
# Build a Series from a plain list, attaching the labels as its index
pd.Series(mylist, index=labels)

a    10
b    20
c    20
dtype: int64

In [None]:
# Display the NumPy array built with np.arange
array

array([10, 12, 14])

In [None]:
# Build a Series from a NumPy array; the second argument is the index
pd.Series(data=array, index=labels)

a    10
b    12
c    14
dtype: int64

In [None]:
# Build a Series from a dict: keys become the index, values become the data
pd.Series(data=d)

a    20
b    30
c    40
dtype: int64

In [None]:
# Let's create sales data: one value per country, with the country as index label
salesQ1 = pd.Series({'USA': 250, 'China': 250, 'India': 200, 'Brazil': 150})

In [None]:
# Display the Series: index labels on the left, values on the right
salesQ1

USA       250
China     250
India     200
Brazil    150
dtype: int64

In [None]:
# A second sales Series; note it has 'Japan' where salesQ1 has 'Brazil'
salesQ2 = pd.Series({'USA': 100, 'China': 250, 'India': 300, 'Japan': 150})

In [None]:
# Display the second Series
salesQ2

USA      100
China    250
India    300
Japan    150
dtype: int64

In [None]:
# Look up a single value by its index label
salesQ2['China']

250

In [None]:
# Interesting insight: addition aligns on index labels. A label present in
# only one Series (here Brazil and Japan) yields NaN, and the result is
# upcast to float so that NaN can be represented.
salesQ1.add(salesQ2)

Brazil      NaN
China     500.0
India     500.0
Japan       NaN
USA       350.0
dtype: float64

In [None]:
# A Pandas DataFrame is multiple Pandas Series that share the same index
# A Pandas DataFrame is similar to a spreadsheet

In [None]:
# Build a 5x4 array of random integers drawn from [-100, 100).
# The generator is seeded so that Restart & Run All reproduces the same
# numbers every time. The outputs stored in this notebook came from an
# unseeded run, so they will differ until the notebook is re-executed.
from numpy.random import randint

np.random.seed(42)  # arbitrary fixed seed, chosen only for reproducibility
data = randint(-100, 100, (5, 4))
data



array([[-38,  27,  84,  61],
       [-42,  52,  63,  27],
       [ 65,  26,  18,  87],
       [ 18, -34,  89, -42],
       [-43, -67, -60, -46]])

In [None]:
# Row labels (one per row of `data`) and column labels (one per column)
index = list('ABCDE')
column = list('WXYZ')

In [None]:
# Now let's create a DataFrame: values come from `data`, rows are labelled
# by `index` and columns by `column`
df = pd.DataFrame(data=data, index=index, columns=column)
df

Unnamed: 0,W,X,Y,Z
A,-38,27,84,61
B,-42,52,63,27
C,65,26,18,87
D,18,-34,89,-42
E,-43,-67,-60,-46


In [None]:
# Let's grab a single column; the result is a Series named after the column
df['W']

A   -38
B   -42
C    65
D    18
E   -43
Name: W, dtype: int64

In [None]:
# Let's grab a list of columns; passing a list of names returns a DataFrame
df[['W','Z']]

Unnamed: 0,W,Z
A,-38,61
B,-42,27
C,65,87
D,18,-42
E,-43,-46


In [None]:
# Let's create a new column holding the element-wise sum of W and Z.
# Assigning to a new column name modifies `df` in place.
df['new'] = df['W'].add(df['Z'])
df

Unnamed: 0,W,X,Y,Z,new
A,-38,27,84,61,23
B,-42,52,63,27,-15
C,65,26,18,87,152
D,18,-34,89,-42,-24
E,-43,-67,-60,-46,-89


In [None]:
# Let's drop a column. drop() returns a new frame, so `df` is rebound to it.
# Re-running this cell without recreating 'new' first raises a KeyError.
df = df.drop(columns='new')


In [None]:
# Display the frame: the 'new' column is gone
df

Unnamed: 0,W,X,Y,Z
A,-38,27,84,61
B,-42,52,63,27
C,65,26,18,87
D,18,-34,89,-42
E,-43,-67,-60,-46


In [None]:
# Now let's work with rows
# .loc selects by index label; a single label returns that row as a Series
df.loc['A']

W   -38
X    27
Y    84
Z    61
Name: A, dtype: int64

In [None]:
# A list of labels selects several rows and returns a DataFrame
df.loc[['A','E']]

Unnamed: 0,W,X,Y,Z
A,-38,27,84,61
E,-43,-67,-60,-46


In [None]:
# .iloc selects by integer position; position 0 is the first row ('A')
df.iloc[0]

W   -38
X    27
Y    84
Z    61
Name: A, dtype: int64

In [None]:
# Positional slice: rows at positions 0, 1 and 2 (the end position is excluded)
df.iloc[0:3]

Unnamed: 0,W,X,Y,Z
A,-38,27,84,61
B,-42,52,63,27
C,65,26,18,87


In [None]:
# Drop the row labelled 'C'. drop() returns a new frame, so `df` is rebound.
# Re-running this cell after 'C' is already gone raises a KeyError.
df = df.drop(index='C')

In [None]:
# Display the frame: row 'C' is gone
df

Unnamed: 0,W,X,Y,Z
A,-38,27,84,61
B,-42,52,63,27
D,18,-34,89,-42
E,-43,-67,-60,-46


In [None]:
# Let's now get a subset: .loc[row_label, column_label] returns a single value
df.loc['A','W']

-38

In [None]:
# Let's apply a condition: keep the values greater than 0 and replace every
# other entry with NaN. `df` is rebound to the masked frame.
df = df.where(df > 0)

In [None]:
# Display the masked frame: entries that failed the condition show as NaN
df

Unnamed: 0,W,X,Y,Z
A,,27.0,84.0,61.0
B,,52.0,63.0,27.0
D,18.0,,89.0,
E,,,,


In [None]:
# Missing data is common in real-world datasets. Common ways to handle it:
# 1. Leave the missing data as missing (NaN), for example in the case of categorical data
# 2. Remove the missing data
# 3. Fill in the missing data, for example with the mode, mean, or median


In [None]:
# Drop every row that contains at least one NaN (the default behaviour).
# Returns a new frame; `df` itself is not modified.
df.dropna(axis='index', how='any')

Unnamed: 0,W,X,Y,Z


In [None]:
# Drop every column that contains at least one NaN
df.dropna(axis='columns')

A
B
D
E


In [None]:
# Keep only the columns that have at least `thresh` non-NaN values;
# with thresh=1 a column is dropped only if it is entirely NaN
df.dropna(axis='columns', thresh=1)

Unnamed: 0,W,X,Y,Z
A,,27.0,84.0,61.0
B,,52.0,63.0,27.0
D,18.0,,89.0,
E,,,,


In [None]:
# Replace every NaN with a placeholder string.
# Returns a new frame; `df` itself is not modified.
df.fillna('Fill Value')

Unnamed: 0,W,X,Y,Z
A,Fill Value,27,84,61
B,Fill Value,52,63,27
D,18,Fill Value,89,Fill Value
E,Fill Value,Fill Value,Fill Value,Fill Value


In [None]:
# Replace every NaN with 0; the columns stay float
df.fillna(0)

Unnamed: 0,W,X,Y,Z
A,0.0,27.0,84.0,61.0
B,0.0,52.0,63.0,27.0
D,18.0,0.0,89.0,0.0
E,0.0,0.0,0.0,0.0


In [None]:
# Mean of column W; NaN entries are skipped (the default), so only the
# non-missing values contribute
df['W'].mean(skipna=True)

18.0

In [None]:
# Fill the missing values in column W with that column's mean.
# The result is a new Series; `df` itself is not modified.
df['W'].fillna(df['W'].mean())

A    18.0
B    18.0
D    18.0
E    18.0
Name: W, dtype: float64