# PANDAS --  confidently load, explore, clean, transform, combine, and export datasets.

## DataFrame Creation

In [2]:
import pandas as pd

`pd.read_csv("file_name")` (or `pd.read_csv("path/to/file")` when the file is in another directory) reads a CSV file into a DataFrame.

In [3]:
# Load earth.csv into a DataFrame; the path is relative, so the file must be
# reachable from the notebook's current working directory.
r_cSv = pd.read_csv("earth.csv")

`pd.read_excel("file_name")` (or `pd.read_excel("path/to/file")` when the file is in another directory) reads an Excel file into a DataFrame.

In [4]:
# Load the workbook water.xlsx (relative path) into a DataFrame. No sheet_name
# is passed, so pandas reads its default sheet (the first one).
r_Ex = pd.read_excel("water.xlsx")

## Indexing and Slicing (`loc` and `iloc`)

In [5]:
# Demo frame for the indexing examples. The rows carry string labels
# (u1..u3) so that label-based (.loc) and position-based (.iloc) lookups
# are easy to tell apart.
people = pd.DataFrame(
    [
        ("Sumit", 18, "uk"),
        ("Sahil", 19, "ny"),
        ("Ankit", 20, "nj"),
    ],
    columns=["Name", "Age", "city"],
    index=["u1", "u2", "u3"],
)
print(people)

     Name  Age city
u1  Sumit   18   uk
u2  Sahil   19   ny
u3  Ankit   20   nj


In [6]:
# .loc selects by label: this returns the row whose index label is 'u1' as a Series.
people.loc['u1'] # row with index label u1

Name    Sumit
Age        18
city       uk
Name: u1, dtype: object

In [7]:
# ':' keeps every row; "Name" picks the column by its label. The result is a Series.
people.loc[:,"Name"] # column by label

u1    Sumit
u2    Sahil
u3    Ankit
Name: Name, dtype: object

In [8]:
# .iloc selects by integer position rather than by label.
people.iloc[0:3] # rows at positions 0, 1 and 2 (the stop value 3 is excluded)

Unnamed: 0,Name,Age,city
u1,Sumit,18,uk
u2,Sahil,19,ny
u3,Ankit,20,nj


In [9]:
# Every row, column at position 2 (the third column, "city").
people.iloc[:,2]

u1    uk
u2    ny
u3    nj
Name: city, dtype: object

In [10]:
# Row at position 2 (label u3), all columns; returned as a Series.
people.iloc[2,:]

Name    Ankit
Age        20
city       nj
Name: u3, dtype: object

In [11]:
# Rows at positions 0-1 and columns at positions 0-1; slice stop values are excluded.
people.iloc[0:2,0:2]

Unnamed: 0,Name,Age
u1,Sumit,18
u2,Sahil,19


## DATA CLEANING

dropna(): remove rows/columns with missing data; supports axis, how, subset, thresh.

fillna(): replace missing values with a constant or a per-column mapping; to propagate neighbouring values instead, use ffill()/bfill().

drop(): remove rows by label or columns by name.

rename(): rename columns or index labels.

isna()/notna(): detect missing values.

drop_duplicates(): remove duplicate rows.

In [13]:
import numpy as np

# Toy frame for the cleaning examples: each column has at least one NaN,
# and only rows 1 and 4 are complete.
df = pd.DataFrame(
    dict(
        A=[1, 2, 3, np.nan, 5],
        B=[np.nan, 2, 3, 4, 5],
        C=[1, 2, np.nan, np.nan, 5],
    )
)
print(df)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  2.0
2  3.0  3.0  NaN
3  NaN  4.0  NaN
4  5.0  5.0  5.0


In [15]:
# isna() returns a boolean frame with the same shape as df: True marks a missing value.
# NOTE: despite its name, missing_count holds this True/False mask, not a count;
# df.isna().sum() would give the number of missing values per column.
missing_count = df.isna()
print(missing_count)

       A      B      C
0  False   True  False
1  False  False  False
2  False  False   True
3   True  False   True
4  False  False  False


In [16]:
# dropna() with default arguments removes every row that contains at least
# one NaN. It returns a new frame, so df itself is left unchanged.
clean1 = df.dropna()
print(clean1)

     A    B    C
1  2.0  2.0  2.0
4  5.0  5.0  5.0


In [18]:
# Replace NaN column by column: 4 in A, 1 in B and the text "unknown" in C.
# Column C therefore ends up holding a mix of numbers and strings.
fill_miss = df.fillna(
    value={"A": 4, "B": 1, "C": "unknown"},
)
print(fill_miss)

     A    B        C
0  1.0  1.0      1.0
1  2.0  2.0      2.0
2  3.0  3.0  unknown
3  4.0  4.0  unknown
4  5.0  5.0      5.0


In [21]:
# Remove column "A". drop() returns a new frame; fill_miss keeps all of its
# columns and is reused by the next example.
Drrop = fill_miss.drop(
    columns=["A"],
)
print(Drrop)

     B        C
0  1.0      1.0
1  2.0      2.0
2  3.0  unknown
3  4.0  unknown
4  5.0      5.0


In [23]:
# Rename column "B" to "Great"; the other columns keep their names.
rename_1 = fill_miss.rename(
    columns={"B": "Great"},
)
print(rename_1)

     A  Great        C
0  1.0    1.0      1.0
1  2.0    2.0      2.0
2  3.0    3.0  unknown
3  4.0    4.0  unknown
4  5.0    5.0      5.0


In [24]:
# Keep the first row for each distinct value of "Great" and drop the rest.
# Every value in "Great" is unique here, so no row is removed.
dup = rename_1.drop_duplicates(
    subset=["Great"],
    keep="first",
)
print(dup)

     A  Great        C
0  1.0    1.0      1.0
1  2.0    2.0      2.0
2  3.0    3.0  unknown
3  4.0    4.0  unknown
4  5.0    5.0      5.0


## Aggregation & sorting

In [25]:
# Sales records used by the aggregation and sorting examples:
# one row per (region, product) sale with its revenue and unit count.
sales = pd.DataFrame(
    [
        ("East", "A", 100, 10),
        ("East", "B", 150, 15),
        ("West", "A", 200, 20),
        ("West", "B", 50, 5),
        ("East", "A", 120, 12),
    ],
    columns=["region", "product", "revenue", "units"],
)
print(sales)

  region product  revenue  units
0   East       A      100     10
1   East       B      150     15
2   West       A      200     20
3   West       B       50      5
4   East       A      120     12


In [26]:
# Per-region summary using named aggregation: each keyword becomes an output
# column computed from the given source column and function.
# reset_index() turns the "region" group key back into an ordinary column.
agg = (
    sales
    .groupby("region")
    .agg(
        revenue_sum=pd.NamedAgg(column="revenue", aggfunc="sum"),
        revenue_mean=pd.NamedAgg(column="revenue", aggfunc="mean"),
        units_std=pd.NamedAgg(column="units", aggfunc="std"),
    )
    .reset_index()
)
print(agg)

  region  revenue_sum  revenue_mean  units_std
0   East          370    123.333333   2.516611
1   West          250    125.000000  10.606602


In [35]:
# Order the rows by revenue, largest first. Each row keeps its original
# index label, so the index is no longer in ascending order.
sorti = sales.sort_values("revenue", ascending=False)
print(sorti)

  region product  revenue  units
2   West       A      200     20
1   East       B      150     15
4   East       A      120     12
0   East       A      100     10
3   West       B       50      5


In [38]:
# Multi-key sort: revenue descending first, then units ascending to break ties.
# No two rows share a revenue value here, so the tie-breaker never applies and
# the result matches the single-key sort.
sorti_multi = sales.sort_values(
    by=["revenue", "units"],
    ascending=[False, True],
)
print(sorti_multi)

  region product  revenue  units
2   West       A      200     20
1   East       B      150     15
4   East       A      120     12
0   East       A      100     10
3   West       B       50      5


In [39]:
# Summary statistics over the whole revenue column.
avg_revenue = sales["revenue"].mean()  # arithmetic mean
std_revenue = sales["revenue"].std()   # sample standard deviation (pandas default, ddof=1)
print(avg_revenue, std_revenue, sep="\n")

124.0
55.94640292279746


In [43]:
# Cross-tabulate total revenue: one row per region, one column per product.
# fill_value=0 puts 0 in any region/product combination that has no sales.
pivot = sales.pivot_table(
    values="revenue",
    index="region",
    columns="product",
    aggfunc="sum",
    fill_value=0,
)
print(pivot)

product    A    B
region           
East     220  150
West     200   50
