# Pandas
Pandas is a powerful and widely-used Python library designed for data manipulation and analysis. It is particularly well-suited for handling structured data, such as tabular data in spreadsheets or databases. Below are some key features and concepts related to Pandas:

In [1]:
# importing the pandas 
import pandas as pd

In [2]:
data =[1,2,3,4,5]
series=pd.Series(data)

In [4]:
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [5]:
# creating the series from the dectionary
data={'a':1,'b':4,'c':6}
series_dict=pd.Series(data)

In [6]:
series_dict

a    1
b    4
c    6
dtype: int64

In [7]:
# applying the index 
data=[1,3,4]
index=['a','b','c']
pd.Series(data,index=index)

a    1
b    3
c    4
dtype: int64

In [8]:
# creating the datafram a dictionary of list
data={
    'name':['Ashwani','vishal','Sahil'],
'age':[29,17,23],
'city':['Noida','Delhi','Saharanpur']
}
data_frame=pd.DataFrame(data)

In [9]:
data_frame

Unnamed: 0,name,age,city
0,Ashwani,29,Noida
1,vishal,17,Delhi
2,Sahil,23,Saharanpur


In [10]:
# creating the data frame from the List of Dictionary
data=[
    {'Name':'Ashwani','Age':29,'City':'Noida'},
    {'Name':'Vishal','Age':17,'City':'Noida'},
    {'Name':'Ashwani','Age':23,'City':'Noida'}


]
data_f=pd.DataFrame(data)

In [11]:
data_f

Unnamed: 0,Name,Age,City
0,Ashwani,29,Noida
1,Vishal,17,Noida
2,Ashwani,23,Noida


In [12]:
df=pd.read_csv('annual-co2-emissions-per-country.csv')

In [13]:
df

Unnamed: 0,Entity,Code,Year,Annual CO₂ emissions
0,Afghanistan,AFG,1949,14656.0
1,Afghanistan,AFG,1950,84272.0
2,Afghanistan,AFG,1951,91600.0
3,Afghanistan,AFG,1952,91600.0
4,Afghanistan,AFG,1953,106256.0
...,...,...,...,...
30303,Zimbabwe,ZWE,2018,10714598.0
30304,Zimbabwe,ZWE,2019,9775428.0
30305,Zimbabwe,ZWE,2020,7849639.0
30306,Zimbabwe,ZWE,2021,8396158.0


In [14]:
df.head()

Unnamed: 0,Entity,Code,Year,Annual CO₂ emissions
0,Afghanistan,AFG,1949,14656.0
1,Afghanistan,AFG,1950,84272.0
2,Afghanistan,AFG,1951,91600.0
3,Afghanistan,AFG,1952,91600.0
4,Afghanistan,AFG,1953,106256.0


The main difference between .loc and .iloc in Pandas is how they access data in a DataFrame:

- .loc is label-based, meaning it uses row and column labels.
- .iloc is integer-based, meaning it uses row and column index positions.

![image.png](attachment:image.png)

In [16]:
## Accessing the specified elements
df.at[1,'Code']

'AFG'

In [18]:
df.iat[3,3]

91600.0

In [None]:
# add the new column into the data frame
data_f['Height']=[2.3,1.2,2.1]

In [20]:
data_f.head()

Unnamed: 0,Name,Age,City,Height
0,Ashwani,29,Noida,2.3
1,Vishal,17,Noida,1.2
2,Ashwani,23,Noida,2.1


In [21]:
data_f.drop(columns='Height', inplace=True)


In [22]:
data_f

Unnamed: 0,Name,Age,City
0,Ashwani,29,Noida
1,Vishal,17,Noida
2,Ashwani,23,Noida


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30308 entries, 0 to 30307
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entity                30308 non-null  object 
 1   Code                  24157 non-null  object 
 2   Year                  30308 non-null  int64  
 3   Annual CO₂ emissions  30308 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 947.3+ KB


In [24]:
df.describe()

Unnamed: 0,Year,Annual CO₂ emissions
count,30308.0,30308.0
mean,1940.191336,391272200.0
std,65.510232,1855825000.0
min,1750.0,0.0
25%,1902.0,183200.0
50%,1959.0,3856092.0
75%,1991.0,47277520.0
max,2022.0,37149790000.0


In [None]:
# Rename the coluumns 
df=df.rename(columns={'Annual CO₂ emissions':'CO₂ emission'})
df.head()

Unnamed: 0,Entity,Code,Year,CO₂ emission
0,Afghanistan,AFG,1949,14656.0
1,Afghanistan,AFG,1950,84272.0
2,Afghanistan,AFG,1951,91600.0
3,Afghanistan,AFG,1952,91600.0
4,Afghanistan,AFG,1953,106256.0


In [26]:
df.isnull().sum()

Entity             0
Code            6151
Year               0
CO₂ emission       0
dtype: int64

In [27]:
df['New value']=df['CO₂ emission'].astype(int)
df.head()

Unnamed: 0,Entity,Code,Year,CO₂ emission,New value
0,Afghanistan,AFG,1949,14656.0,14656
1,Afghanistan,AFG,1950,84272.0,84272
2,Afghanistan,AFG,1951,91600.0,91600
3,Afghanistan,AFG,1952,91600.0,91600
4,Afghanistan,AFG,1953,106256.0,106256


In [28]:
df['sqrt of CO2 Emission'] = df['New value'].apply(lambda x: x**0.5)
df.head()


Unnamed: 0,Entity,Code,Year,CO₂ emission,New value,sqrt of CO2 Emission
0,Afghanistan,AFG,1949,14656.0,14656,121.061968+ 0.000000j
1,Afghanistan,AFG,1950,84272.0,84272,290.296400+ 0.000000j
2,Afghanistan,AFG,1951,91600.0,91600,302.654919+ 0.000000j
3,Afghanistan,AFG,1952,91600.0,91600,302.654919+ 0.000000j
4,Afghanistan,AFG,1953,106256.0,106256,325.969324+ 0.000000j


In [29]:
group_mean=df.groupby('Code')['New value'].mean()

In [None]:
df.

In [30]:
group_mean

Code
ABW    8.022785e+05
AFG    3.118328e+06
AGO    9.301034e+06
AIA    1.008720e+05
ALB    3.320977e+06
           ...     
WSM    1.000805e+05
YEM    9.418806e+06
ZAF    1.572661e+08
ZMB    3.677525e+06
ZWE    6.655787e+06
Name: New value, Length: 217, dtype: float64

In [31]:
group_sum=df.groupby('Code')['New value'].sum()
group_sum.head()

Code
ABW     77821014
AFG    230756256
AGO    678975489
AIA      3328777
ALB    298887887
Name: New value, dtype: int64

In [35]:
# Aggregate the multiple function
grouped_agg=df.groupby('Code')['New value'].agg(['mean','count','sum'])

In [36]:
grouped_agg.head()

Unnamed: 0_level_0,mean,count,sum
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,802278.5,97,77821014
AFG,3118328.0,74,230756256
AGO,9301034.0,73,678975489
AIA,100872.0,33,3328777
ALB,3320977.0,90,298887887


In [47]:
# murging and joining the data set 
df1=pd.DataFrame({'Key':['A','B','C'], 'Value1':[12,23,34]})
# murging and joining the data set 
df2=pd.DataFrame({'Key':['B','C','D'], 'Value1':[21,23,54]})

In [48]:
df1.head()

Unnamed: 0,Key,Value1
0,A,12
1,B,23
2,C,34


In [49]:
df2.head()

Unnamed: 0,Key,Value1
0,B,21
1,C,23
2,D,54


In [45]:
pd.merge(df1,df2,on='Key',how='inner')

Unnamed: 0,Key,Value1_x,Value1_y
0,A,12,21
1,B,23,23
2,C,34,54


In [50]:
pd.merge(df1,df2,on='Key',how='left')

Unnamed: 0,Key,Value1_x,Value1_y
0,A,12,
1,B,23,21.0
2,C,34,23.0


In [51]:
pd.merge(df1,df2,on='Key',how='right')

Unnamed: 0,Key,Value1_x,Value1_y
0,B,23.0,21
1,C,34.0,23
2,D,,54


In [54]:
pd.merge(df1,df2,on='Key',how='outer')

Unnamed: 0,Key,Value1_x,Value1_y
0,A,12.0,
1,B,23.0,21.0
2,C,34.0,23.0
3,D,,54.0


In [56]:
! pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   -------- ------------------------------- 0.4/1.9 MB 8.1 MB/s eta 0:00:01
   ------------------- -------------------- 0.9/1.9 MB 9.8 MB/s eta 0:00:01
   ------------------------- -------------- 1.2/1.9 MB 8.6 MB/s eta 0:00:01
   ------------------------------------ --- 1.7/1.9 MB 9.1 MB/s eta 0:00:01
   ---------------------------------------  1.9/1.9 MB 9.2 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 8.0 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-33.0.0



[notice] A new release of pip is available: 24.1.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
from faker import Faker
import pandas as pd

# Initialize Faker
fake = Faker()

# Create a sample dataset
data = {
    'Name': [fake.name() for _ in range(10)],
    'Address': [fake.address() for _ in range(10)],
    'Email': [fake.email() for _ in range(10)],
    'Phone Number': [fake.phone_number() for _ in range(10)],
    'Job': [fake.job() for _ in range(10)],
    'Salary': [fake.random_int(min=50000, max=120000) for _ in range(10)],
}

df = pd.DataFrame(data)


In [59]:
df.head()


Unnamed: 0,Name,Address,Email,Phone Number,Job,Salary
0,Donna Lopez,"798 Rubio Way Apt. 716\nJohnview, CO 12415",dperry@example.com,3657507226,Chartered certified accountant,106026
1,Eric Bailey,USS Aguilar\nFPO AE 91199,rosariostephanie@example.org,001-342-535-6677,Dentist,70323
2,Vicki Vaughn MD,"93675 Kim Land Suite 246\nLake Gina, ID 68926",amydavis@example.net,(632)878-6762x7804,Herbalist,79064
3,Melissa Klein,"2961 Shah Canyon\nRubioside, PA 87065",dawnhoffman@example.com,001-363-507-1525x13805,Higher education lecturer,59499
4,Laura Espinoza,"43537 Jennifer Creek Suite 927\nCruzmouth, IA ...",utucker@example.org,001-784-395-2666x0834,Community development worker,96202
