<a href="https://colab.research.google.com/github/Anjasfedo/data-analysis/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [2]:
import pandas as pd

# Intro to DataFrame

In [10]:
# Build a small 3x3 demo frame column by column: each dict key is a column
# label and each list holds that column's values for rows "x", "y", "z".
df = pd.DataFrame(
    {
        "A": [1, 4, 6],
        "B": [2, 5, 7],
        "C": [3, 6, 8],
    },
    index=["x", "y", "z"],
)

In [11]:
# Preview the first rows of the frame (head() shows at most the first 5 rows
# by default, so this 3-row frame is displayed in full).
df.head()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,6,7,8


In [12]:
# Preview the last rows of the frame (tail() shows at most the last 5 rows
# by default, so this 3-row frame is displayed in full).
df.tail()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,6,7,8


In [13]:
# Inspect the column labels; they are returned as a pandas Index object.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [14]:
# Get the row labels (the index) as a plain Python list.
df.index.tolist()

['x', 'y', 'z']

In [16]:
# Print a structural summary: index range, per-column non-null counts,
# dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 204.0+ bytes


In [17]:
# Summary statistics for each numeric column: count, mean, std, min,
# quartiles (25% / 50% / 75%), and max.
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,3.666667,4.666667,5.666667
std,2.516611,2.516611,2.516611
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.0,6.0,7.0
max,6.0,7.0,8.0


In [19]:
# Count the number of distinct values in each column.
df.nunique()

A    3
B    3
C    3
dtype: int64

In [20]:
# List the distinct values of a single column (returned as a NumPy array).
df["A"].unique()

array([1, 4, 6])

In [21]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(3, 3)

In [22]:
# Total number of cells in the frame (rows x columns).
df.size

9

In [25]:
# Build a boolean mask of the same shape: True where a value is missing,
# False otherwise (all False here because the frame has no missing values).
df.isnull()

Unnamed: 0,A,B,C
x,False,False,False
y,False,False,False
z,False,False,False


# Loading in DataFrame from Files

## Download the file

In [26]:
import requests

def download_file(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the downloaded bytes are written to
            (opened in binary mode, overwriting any existing file).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of silently skipping the write:
    # otherwise the failure only surfaces later as a confusing "file not
    # found" error when the notebook tries to read the missing file.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)

In [28]:
# Fetch the sample coffee-sales CSV and save it next to the notebook as
# "coffee.csv" so it can be loaded from disk in the next section.
download_file(url="https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/master/warmup-data/coffee.csv",
              filename="coffee.csv")

## Load to DataFrame

In [29]:
# Load the locally saved CSV file into a DataFrame.
coffee = pd.read_csv("coffee.csv")

# Preview the first rows to confirm the columns were parsed as expected.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


## Load directly from url

In [75]:
# read_csv also accepts a URL, so the same data can be loaded without saving
# it to disk first. This rebinds `coffee` to a freshly loaded frame.
coffee = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/master/warmup-data/coffee.csv")

# Preview the first rows; they match the frame loaded from the local file.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


#### CSV, Feather, Parquet
Ordered by file size, from largest to smallest: CSV > Feather > Parquet

In [76]:
# Load the athlete biographies dataset straight from its URL.
bios = pd.read_csv("https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/master/data/bios.csv")

# Preview the first rows; note that some fields (e.g. height_cm, died_date)
# are empty for some athletes.
bios.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


# Accessing Data

In [77]:
# First 5 rows of the coffee sales data.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [41]:
# Last 5 rows of the coffee sales data.
coffee.tail()

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [78]:
# Draw 5 random rows; fixing random_state makes the same rows come back on
# every run, which keeps the notebook reproducible.
coffee.sample(5, random_state=42)

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
11,Saturday,Latte,35
0,Monday,Espresso,25
12,Sunday,Espresso,45
5,Wednesday,Latte,25


In [79]:
# .loc selects by label: .loc[row_labels, column_labels].
# Passing a list of row labels returns a DataFrame -- here the single row
# labelled 0 with all of its columns.
coffee.loc[[0]]

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25


In [80]:
# Select several rows at once by listing their labels.
coffee.loc[[0,1,2]]

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30


In [81]:
# Label slices with .loc include BOTH endpoints, so 0:2 returns rows 0, 1
# and 2 (unlike ordinary Python slicing, which would stop before 2).
coffee.loc[0:2]

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30


In [82]:
# Restrict both axes: rows labelled 5 through 7 (inclusive) and only the
# "Day" and "Coffee Type" columns.
coffee.loc[5:7, ["Day", "Coffee Type"]]

Unnamed: 0,Day,Coffee Type
5,Wednesday,Latte
6,Thursday,Espresso
7,Thursday,Latte


In [83]:
# .iloc selects by integer position rather than by label. The row slice
# 0:5 excludes its end (rows at positions 0-4), and [0, 2] picks the first
# and third columns ("Day" and "Units Sold").
coffee.iloc[0:5, [0,2]]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20
4,Wednesday,35


In [84]:
# Overwrite a single cell addressed by row label and column name.
# This modifies `coffee` in place, so every later cell sees 10 units sold
# for row 1 instead of the originally loaded value.
coffee.loc[1, "Units Sold"] = 10

In [85]:
# Confirm the assignment: row 1 now shows 10 in "Units Sold".
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [86]:
# .at reads a single scalar value addressed by row label and column label.
coffee.at[0, "Units Sold"]

25

In [87]:
# .iat reads a single scalar value addressed by integer position
# (row 0, column 2) -- the position-based counterpart of .at.
coffee.iat[0, 2]

25

In [90]:
# Selecting one column with square brackets returns it as a Series.
coffee["Day"]

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [92]:
# Display the rows ordered by "Units Sold" from highest to lowest.
coffee.sort_values("Units Sold", ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
2,Tuesday,Espresso,30
7,Thursday,Latte,30


In [94]:
# Sort on two keys: "Day" ascending, then "Units Sold" descending within each
# day. `ascending` takes one flag per sort key; explicit booleans are used
# (instead of 1/0) because that is the documented form and reads unambiguously.
# Note that "Day" holds plain strings, so it sorts alphabetically
# (Friday, Monday, Saturday, ...) rather than in weekday order.
coffee.sort_values(["Day", "Units Sold"], ascending=[True, False])

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
9,Friday,Latte,35
0,Monday,Espresso,25
1,Monday,Latte,10
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35
6,Thursday,Espresso,40
7,Thursday,Latte,30


In [96]:
# Walk the frame one row at a time: iterrows() yields (index label, row)
# pairs, where each row is a Series keyed by column name.
# A single print emits the label, the row, and a trailing blank separator line.
for index, row in coffee.iterrows():
    print(index, row, "", sep="\n")

0
Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

1
Day            Monday
Coffee Type     Latte
Units Sold         10
Name: 1, dtype: object

2
Day             Tuesday
Coffee Type    Espresso
Units Sold           30
Name: 2, dtype: object

3
Day            Tuesday
Coffee Type      Latte
Units Sold          20
Name: 3, dtype: object

4
Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: 4, dtype: object

5
Day            Wednesday
Coffee Type        Latte
Units Sold            25
Name: 5, dtype: object

6
Day            Thursday
Coffee Type    Espresso
Units Sold           40
Name: 6, dtype: object

7
Day            Thursday
Coffee Type       Latte
Units Sold           30
Name: 7, dtype: object

8
Day              Friday
Coffee Type    Espresso
Units Sold           45
Name: 8, dtype: object

9
Day            Friday
Coffee Type     Latte
Units Sold         35
Name: 9, dtype: object

10
Day            Saturday
Co