## Pandas
* Pandas is a library used for data manipulation and analysis in Python.
* It has functions for analyzing, cleaning, exploring, and visualizing data.
* Pandas allows us to analyze big data sets easily and efficiently.


In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd

# Build a small two-column table from a dict: each key becomes a column
# label and each list supplies that column's values, one entry per row.
mydataset = dict(
    cars=["Ram", "Shyam", "Bibek"],
    Marks=[30, 70, 20],
)
myvar = pd.DataFrame(data=mydataset)

print(myvar)

    cars  Marks
0    Ram     30
1  Shyam     70
2  Bibek     20


In [7]:
# Series
# A pandas Series behaves like a single column of a table:
# a one-dimensional labelled array that can hold data of any type.
a = [1, 7, 2]
myvar = pd.Series(data=a)

# Show the whole Series first, then the element stored under index label 1.
print(myvar, myvar[1], sep="\n")


0    1
1    7
2    2
dtype: int64
7


In [8]:
# The index argument assigns custom labels to the entries
# in place of the default 0, 1, 2 labels.
a = [1, 7, 2]
myvar = pd.Series(data=a, index=list("xyz"))
print(myvar)

x    1
y    7
z    2
dtype: int64


In [9]:
# Look up a single value through its index label.
print(myvar.loc["y"])

7


In [12]:
# A Series can also be built from a dictionary:
# the keys become the index labels and the values become the data.
calories = dict(ram=420, shyam=380, deepak=390)
myvar = pd.Series(data=calories)
print(myvar)

ram       420
shyam     380
deepak    390
dtype: int64


In [13]:
# Data frames
# A DataFrame is a two-dimensional data structure: a table of rows and columns.
data = dict(
    length=[420, 380, 390],
    time=[5, 4, 4.5],
)
myvar = pd.DataFrame(data=data)
print(myvar)

   length  time
0     420   5.0
1     380   4.0
2     390   4.5


In [14]:
# Same dict-based construction, but with an explicit index so that each
# row is labelled by a name instead of the default 0, 1, 2.
data = dict(
    Subject=["Math", "Science", "English"],
    Marks=[85, 90, 78],
)
myvar = pd.DataFrame(data=data, index=["Ram", "Shyam", "Bibek"])
print(myvar)

       Subject  Marks
Ram       Math     85
Shyam  Science     90
Bibek  English     78


### Reading CSV File Using Pandas
```python
ab = pd.read_csv('filename.csv')
```


In [10]:
import pandas as pd
# Load the CSV file into a DataFrame and print it.
df = pd.read_csv("data.csv")
print(df)


     Name  Age   City  Salary
0    anar    2  japan   70000
1  shayau  253  nepal   65000
2    kera  315  pluto   80000
3     aap   92    sun   72000


In [None]:
# Load the CSV file, then preview only the first 2 rows of the DataFrame.
df = pd.read_csv("data.csv")

print(df.head(n=2))

     Name  Age   City  Salary
0    anar    2  japan   70000
1  shayau  253  nepal   65000


In [None]:
# Preview only the last 3 rows of the DataFrame.
print(df.tail(n=3))

     Name  Age   City  Salary
1  shayau  253  nepal   65000
2    kera  315  pluto   80000
3     aap   92    sun   72000


In [5]:
# display.max_rows is the maximum number of rows pandas shows when printing
# a DataFrame; its default value is 60.
print(pd.get_option("display.max_rows"))

# Raise the limit to 100 rows (same effect as assigning to
# pd.options.display.max_rows).
pd.set_option("display.max_rows", 100)

60


In [8]:
# Reading JSON File Using Pandas
# to_string() renders the entire DataFrame as one text block before printing.
df = pd.read_json("dataa.json")
rendered = df.to_string()
print(rendered)

    name  age  city                      email
0  Bibek   20  Ilam  bibekghimire773@gmail.com


In [28]:
# Assemble a small table of people: one column each for name, age and city.
data = dict(
    Name=["Ram", "Shyam", "Bibek"],
    Age=[20, 21, 19],
    City=["Kathmandu", "Lalitpur", "Bhaktapur"],
)
df = pd.DataFrame(data=data)
print(df)

    Name  Age       City
0    Ram   20  Kathmandu
1  Shyam   21   Lalitpur
2  Bibek   19  Bhaktapur


In [32]:
## cleaning data
# dropna() returns a new DataFrame without the rows that contain missing
# values; the DataFrame held in df is left unchanged.
df = pd.read_csv("clean.csv")
new_df = df.dropna(axis=0, how="any")
print(new_df)

        Duration        Date  Pulse  Maxpulse  Calories
0             60  2020/12/01    110       130     409.1
1             60  2020/12/02    117       145     479.0
2             60  2020/12/03    103       135     340.0
3             45  2020/12/04    109       175     282.4
4             45  2020/12/05    117       148     406.0
5             60  2020/12/06    102       127     300.0
6             60  2020/12/07    110       136     374.0
7            450  2020/12/08    104       134     253.3
8             30  2020/12/09    109       133     195.1
9             60  2020/12/10     98       124     269.0
10            60  2020/12/11    103       147     329.3
11            60  2020/12/12    100       120     250.7
12            60  2020/12/12    100       120     250.7
13            60  2020/12/13    106       128     345.3
14            60  2020/12/14    104       132     379.3
15            60  2020/12/15     98       123     275.0
16            60  2020/12/16     98       120   

In [33]:
# Remove every row that has a missing value, modifying df itself
# (inplace=True) instead of returning a new DataFrame.
df.dropna(inplace=True )


# Alternative to dropping rows: fill every missing value, in any column, with 130.
# NOTE(review): the dropna call above has already removed all rows with missing
# values, so both fillna calls below appear to have nothing left to fill —
# confirm whether they were meant to run on a freshly loaded df instead.
df.fillna(130, inplace=True)
# Fill missing values per column: the dict maps a column name to its fill
# value, so only "Calories" is targeted here.
df.fillna({"Calories": 130}, inplace=True)
print(df)

        Duration        Date  Pulse  Maxpulse  Calories
0             60  2020/12/01    110       130     409.1
1             60  2020/12/02    117       145     479.0
2             60  2020/12/03    103       135     340.0
3             45  2020/12/04    109       175     282.4
4             45  2020/12/05    117       148     406.0
5             60  2020/12/06    102       127     300.0
6             60  2020/12/07    110       136     374.0
7            450  2020/12/08    104       134     253.3
8             30  2020/12/09    109       133     195.1
9             60  2020/12/10     98       124     269.0
10            60  2020/12/11    103       147     329.3
11            60  2020/12/12    100       120     250.7
12            60  2020/12/12    100       120     250.7
13            60  2020/12/13    106       128     345.3
14            60  2020/12/14    104       132     379.3
15            60  2020/12/15     98       123     275.0
16            60  2020/12/16     98       120   