# Pandas
Pandas is a powerful data manipulation library in Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame. A __Series__ is a one-dimensional array-like object, while a __DataFrame__ is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [1]:
import numpy as np
import pandas as pd

### Series
A Series is a one-dimensional, array-like object that stores an index label alongside each value. It is similar to a single column in a table.


In [2]:
'''Creating series (basic method)'''

# No index is passed, so pandas assigns the default integer labels 0..4
# to the values 1..5.
numbers = [1, 2, 3, 4, 5]
S1 = pd.Series(numbers)
S1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
'''Creating Series from dictionary '''
# Each dictionary key becomes an index label and each dictionary value
# becomes the Series element stored under that label.
data = {
    "Name": "Arun",
    "Age": 18,
    "college": "Usar",
}
S2 = pd.Series(data)
S2

Name       Arun
Age          18
college    Usar
dtype: object

In [4]:
'''Giving index and values separately'''
# d1 holds the index labels and d2 the values; they are paired by position.
d1 = ["Name", "age", "college"]
d2 = ["Arun", 18, "Usar"]
S3 = pd.Series(data=d2, index=d1)
S3

Name       Arun
age          18
college    Usar
dtype: object

### DataFrame

A DataFrame is a two-dimensional table with labeled rows and columns; each of its columns is a Series.

In [5]:
'''Creating Df from dictionary of list'''
# Each key becomes a column name and its list supplies that column's values,
# one entry per row.
data = {
    'Name': ['Krish', 'John', 'Jack'],
    'Age': [25, 30, 45],
    'City': ['Bangalore', 'New York', 'Florida'],
}
df = pd.DataFrame(data=data)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,Age,City
0,Krish,25,Bangalore
1,John,30,New York
2,Jack,45,Florida


In [6]:
'''Create a Data frame From a List of Dictionaries'''

# Each dictionary describes one row; its keys name the columns.
data = [
    {'Name': 'Krish', 'Age': 32, 'City': 'Bangalore'},
    {'Name': 'John', 'Age': 34, 'City': 'Bangalore'},
    {'Name': 'Bappy', 'Age': 32, 'City': 'Bangalore'},
    {'Name': 'JAck', 'Age': 32, 'City': 'Bangalore'},
]
df = pd.DataFrame(data)
print(type(df))
print(df, "\n")

# If the dictionaries use different keys, each distinct key gets its own column.
# NOTE(review): the records below repeat exactly the same keys as the first
# sample, so this second frame is identical to the first one and does not
# actually show that behaviour.
data = [
    {'Name': 'Krish', 'Age': 32, 'City': 'Bangalore'},
    {'Name': 'John', 'Age': 34, 'City': 'Bangalore'},
    {'Name': 'Bappy', 'Age': 32, 'City': 'Bangalore'},
    {'Name': 'JAck', 'Age': 32, 'City': 'Bangalore'},
]
df = pd.DataFrame(data)
print(df)
print(type(df))

<class 'pandas.core.frame.DataFrame'>
    Name  Age       City
0  Krish   32  Bangalore
1   John   34  Bangalore
2  Bappy   32  Bangalore
3   JAck   32  Bangalore 

    Name  Age       City
0  Krish   32  Bangalore
1   John   34  Bangalore
2  Bappy   32  Bangalore
3   JAck   32  Bangalore
<class 'pandas.core.frame.DataFrame'>


In [7]:
# Name the columns with the `columns` argument; the row labels stay at the
# default 0..2. The values are random integers in [1, 20), so they differ on
# every run. (numpy is imported as np in the notebook's import cell.)
dfc = pd.DataFrame(np.random.randint(1, 20, size=(3, 3)), columns=['A', 'B', 'C'])
dfc

Unnamed: 0,A,B,C
0,16,5,18
1,15,12,11
2,7,13,15


In [8]:
# Label the rows with the `index` argument; the columns keep the default
# integer labels 0..2. The values are random integers in [1, 20).
random_values = np.random.randint(1, 20, size=(3, 3))
dfr = pd.DataFrame(random_values, index=['A', 'B', 'C'])
dfr

Unnamed: 0,0,1,2
A,12,15,14
B,6,1,2
C,8,1,3


In [9]:
'''Accessing Elements'''

# Display df first so the selections in the following cells can be compared with it.
df

Unnamed: 0,Name,Age,City
0,Krish,32,Bangalore
1,John,34,Bangalore
2,Bappy,32,Bangalore
3,JAck,32,Bangalore


In [10]:
# Select a single column by its label; the result is a Series.
df["Name"]  

0    Krish
1     John
2    Bappy
3     JAck
Name: Name, dtype: object

In [11]:
# Select several columns by passing a list of labels; the result is a DataFrame.
df[["Name","City"]]

Unnamed: 0,Name,City
0,Krish,Bangalore
1,John,Bangalore
2,Bappy,Bangalore
3,JAck,Bangalore


In [12]:
# .loc selects by label, i.e. by the index / column names, not by position.

print(df.loc[1],"\n")      # Row labelled 1, returned as a Series
print(df.loc[0:2],"\n")    # Rows labelled 0 through 2 -- label slices include the end label

# Select rows and columns in one .loc call instead of chaining a second []
# lookup onto df.loc[:]; the printed result is the same.
print(df.loc[:, ["Name","Age"]])

Name         John
Age            34
City    Bangalore
Name: 1, dtype: object 

    Name  Age       City
0  Krish   32  Bangalore
1   John   34  Bangalore
2  Bappy   32  Bangalore 

    Name  Age
0  Krish   32
1   John   34
2  Bappy   32
3   JAck   32


In [13]:
# .iloc selects by integer position, regardless of what the index labels are.

print(df.iloc[1], "\n")      # second row, returned as a Series
print(df.iloc[:3], "\n")     # first three rows; the stop position is excluded

print(df.iloc[:, :2])        # all rows, first two columns

Name         John
Age            34
City    Bangalore
Name: 1, dtype: object 

    Name  Age       City
0  Krish   32  Bangalore
1   John   34  Bangalore
2  Bappy   32  Bangalore 

    Name  Age
0  Krish   32
1   John   34
2  Bappy   32
3   JAck   32


In [14]:
''' To get specified elements only '''

# Using the .at accessor
                    # Reads a single value by row label and column label
print(df.at[1,"Age"])  

# Using the .iat accessor
                    # Reads a single value by integer row position and column position
print(df.iat[1,1])


34
34


Note: pandas can read data from many file formats (CSV, Excel, JSON, and more), so that the data can then be extracted, cleaned, and manipulated.

In [15]:
# Read data.csv into a DataFrame. The path is relative, so the file is looked
# up in the notebook's current working directory.
csv_data = pd.read_csv("data.csv")

csv_data

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North
5,2023-01-06,B,54.0,Product3,192.0,West
6,2023-01-07,A,16.0,Product1,936.0,East
7,2023-01-08,C,89.0,Product1,488.0,West
8,2023-01-09,C,37.0,Product3,772.0,West
9,2023-01-10,A,22.0,Product2,834.0,West


In [16]:
# First rows of the frame (5 when no count is passed).
csv_data.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [17]:
# Last rows of the frame (5 when no count is passed).
csv_data.tail()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns only; `count` leaves out missing values.
csv_data.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


### Data Manipulation

In [19]:
# Show df as it is before the edits made in the following cells.
df

Unnamed: 0,Name,Age,City
0,Krish,32,Bangalore
1,John,34,Bangalore
2,Bappy,32,Bangalore
3,JAck,32,Bangalore


In [20]:
# Assigning to a column label adds that column, or overwrites it if it already
# exists. The list supplies one value per row; None is stored as a missing
# value (NaN).
salaries = [10000, 20000, 30000, None]
df["Salary"] = salaries
df

Unnamed: 0,Name,Age,City,Salary
0,Krish,32,Bangalore,10000.0
1,John,34,Bangalore,20000.0
2,Bappy,32,Bangalore,30000.0
3,JAck,32,Bangalore,


In [21]:
# Adding a new row / Changing data of any row

df.loc[4] = ["Arun",18,"Delhi",5000]    # .loc adds the row because label 4 does not exist yet; .iloc can only overwrite existing positions, it cannot add rows
df

Unnamed: 0,Name,Age,City,Salary
0,Krish,32,Bangalore,10000.0
1,John,34,Bangalore,20000.0
2,Bappy,32,Bangalore,30000.0
3,JAck,32,Bangalore,
4,Arun,18,Delhi,5000.0


In [22]:
# Removing a row

# drop() returns a new DataFrame without the row labelled 0; df itself is not modified.
df.drop(index=0)

Unnamed: 0,Name,Age,City,Salary
1,John,34,Bangalore,20000.0
2,Bappy,32,Bangalore,30000.0
3,JAck,32,Bangalore,
4,Arun,18,Delhi,5000.0


In [23]:
# Removing a column

# columns="Salary" is the same as drop("Salary", axis=1); drop() defaults to
# axis=0, which would look for a row label instead.
df.drop(columns="Salary")

Unnamed: 0,Name,Age,City
0,Krish,32,Bangalore
1,John,34,Bangalore
2,Bappy,32,Bangalore
3,JAck,32,Bangalore
4,Arun,18,Delhi


In [24]:
df    # Unchanged: the drop() calls above returned new frames and left df as it was

Unnamed: 0,Name,Age,City,Salary
0,Krish,32,Bangalore,10000.0
1,John,34,Bangalore,20000.0
2,Bappy,32,Bangalore,30000.0
3,JAck,32,Bangalore,
4,Arun,18,Delhi,5000.0


In [25]:
# Making the drop permanent: drop() returns a new DataFrame, so assign the
# result back to df. (Passing inplace=True would also modify df, but it returns
# None and therefore cannot be chained with further calls.)
df = df.drop(columns="Salary")
df

Unnamed: 0,Name,Age,City
0,Krish,32,Bangalore
1,John,34,Bangalore
2,Bappy,32,Bangalore
3,JAck,32,Bangalore
4,Arun,18,Delhi


1. Create a Pandas DataFrame with 2 columns: 'Category' and 'Value'. Fill the 'Category' column with random categories ('A', 'B', 'C') and the 'Value' column with random integers. Group the DataFrame by 'Category' and compute the sum and mean of 'Value' for each category.

In [26]:
# Build a 10-row frame: a random category label per row ('A', 'B' or 'C') and
# a random integer value in [1, 100).
categories = np.random.choice(['A', 'B', 'C'], size=10)
values = np.random.randint(1, 100, size=10)
df = pd.DataFrame({'Category': categories, 'Value': values})
print("Original DataFrame:")
print(df)

# Per-category sum and mean of 'Value'
grouped = df.groupby('Category')['Value'].agg(['sum', 'mean'])
print("Grouped DataFrame:")
print(grouped)

Original DataFrame:
  Category  Value
0        A     42
1        B     51
2        A     55
3        B     42
4        C     61
5        C     83
6        B     23
7        C     61
8        C     91
9        A     43
Grouped DataFrame:
          sum       mean
Category                
A         140  46.666667
B         116  38.666667
C         296  74.000000


### Multi-indexing

In [37]:
# Row index with two levels: every (Category, subcategory) pair from the
# product of ['A', 'B'] and ['x', 'y'] -- four rows in total.
indicis = pd.MultiIndex.from_product(
    [['A', 'B'], ['x', 'y']],
    names=['Category', 'subcategory'],
)
columns = ['Value1', 'Value2']
df_data = np.random.randint(1, 21, size=(4, 2))   # random integers in [1, 20]

df = pd.DataFrame(data=df_data, index=indicis, columns=columns)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value1,Value2
Category,subcategory,Unnamed: 2_level_1,Unnamed: 3_level_1
A,x,17,19
A,y,2,1
B,x,20,8
B,y,18,5


#### Multi-index dataframe slicing

In [None]:
df.loc['A']   # All rows whose first index level (Category) is 'A'; that level is dropped from the result

Unnamed: 0_level_0,Value1,Value2
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1
x,17,19
y,2,1


In [None]:
df.loc['A','x'] # The single row with Category 'A' and subcategory 'x', returned as a Series

Value1    17
Value2    19
Name: (A, x), dtype: int32

In [44]:
df.loc[('A','x'),'Value1'] # One value: the row ('A', 'x') and the column 'Value1'

17