## Exploring Features in Detail with Series

In [206]:
# Importing the Pandas library
import pandas as pd

In [207]:
# Load the dataset from a CSV file
df = pd.read_csv("D:/myAnalyze/PANDASPLOTLY_FUNCODING_FULLDATA_20240601/00_Material(Uploaded)/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv", encoding="utf-8-sig")

# Display the first 5 rows of the dataset
df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,4,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,46,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,7,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,192,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,1,"Adair, Iowa, US"


In [208]:
# Extract a Series from the DataFrame
# - Using DataFrame["Column_Name"] to select a specific column
countries = df["Country_Region"]

# Print the type of the extracted Series
print(type(countries))  # Output: <class 'pandas.core.series.Series'>

# Display the first 5 values of the Series
countries.head()

<class 'pandas.core.series.Series'>


0    US
1    US
2    US
3    US
4    US
Name: Country_Region, dtype: object

- Size : Returns the total size of the Series (including missing values).
- count() : Returns the size excluding missing values.
- unique() : Returns only the unique values.
- value_counts() : Returns the count of each unique value, excluding missing values.

In [209]:
# Get the total size of the Series, including missing values
countries.size


2522

In [210]:
# Get the count of non-null values in the Series
countries.count()

np.int64(2522)

In [211]:
# Print the number of unique values in the Series
print(len(countries.unique()))

# Retrieve the unique values from the Series
countries.unique()

190


array(['US', 'Canada', 'United Kingdom', 'China', 'Netherlands',
       'Australia', 'Denmark', 'France', 'Afghanistan', 'Albania',
       'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana',
       'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Central African Republic',
       'Chad', 'Chile', 'Colombia', 'Congo (Brazzaville)',
       'Congo (Kinshasa)', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Cyprus', 'Czechia', 'Diamond Princess', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'Gabon', 'Gambia',
       'Georgia', 'Germany', 'Ghana', 'Greece', '

In [212]:
# Count the occurrences of each unique value in the Series, excluding missing values
countries.value_counts()

Country_Region
US                      2228
China                     34
United Kingdom            18
Malaysia                  17
Canada                    16
                        ... 
Winter Olympics 2022       1
Antarctica                 1
Korea, North               1
Nauru                      1
Tuvalu                     1
Name: count, Length: 190, dtype: int64

## Extracting Only the Required Columns from the DataFrame

In [213]:
# Display the column names of the DataFrame
df.columns

Index(['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update',
       'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'Combined_Key'],
      dtype='object')

In [214]:
# Select only the required columns from the DataFrame
covid_stat = df[["Confirmed", "Deaths", "Recovered"]]

# Display the extracted DataFrame
covid_stat

Unnamed: 0,Confirmed,Deaths,Recovered
0,4,0,0
1,47,1,0
2,7,0,0
3,195,3,0
4,1,0,0
...,...,...,...
2517,0,198,0
2518,0,0,0
2519,0,0,0
2520,0,0,0


## Searching for Rows that Match Specific Conditions

In [215]:
# Create a boolean mask where the "Country_Region" column has the value "US"
df["Country_Region"] == "US"

0        True
1        True
2        True
3        True
4        True
        ...  
2517    False
2518    False
2519    False
2520    False
2521    False
Name: Country_Region, Length: 2522, dtype: bool

In [216]:
# Extract only the rows where the "Country_Region" column is "US"
df_US = df[df["Country_Region"] == "US"]

# Display the filtered DataFrame
df_US

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,4,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,46,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,7,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,192,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,1,"Adair, Iowa, US"
...,...,...,...,...,...,...,...,...,...,...,...,...
2246,66000.0,,Guam,US,2020-04-01 21:58:49,13.444300,144.793700,77,3,0,74,"Guam, US"
2273,,,Northern Mariana Islands,US,2020-04-01 21:58:49,15.097900,145.673900,2,0,0,2,",Northern Mariana Islands,US"
2280,,,Puerto Rico,US,2020-04-01 21:58:49,18.220800,-66.590100,286,11,0,275,"Puerto Rico, US"
2285,,,Recovered,US,2020-04-01 21:58:49,0.000000,0.000000,0,0,8474,0,"Recovered, US"


## Handling Missing Values (NaN)
- Check for missing data.
- isnull() : Identifies missing values (True or False).
- sum() : Counts the number of missing values in each column.
- Typically used together as .isnull().sum() to get the count of missing values per column.

In [217]:
# Check for missing values in the DataFrame (returns True for missing values, False otherwise)
df.isnull()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2517,False,True,False,False,False,False,False,False,False,False,False,False
2518,True,True,True,False,False,False,False,False,False,False,False,False
2519,True,True,False,False,False,False,False,False,False,False,False,False
2520,True,True,True,False,False,False,False,False,False,False,False,False


In [218]:
# Count the number of missing values in each column of the DataFrame
df.isnull().sum()

FIPS              347
Admin2            301
Province_State    184
Country_Region      0
Last_Update         0
Lat                 4
Long_               4
Confirmed           0
Deaths              0
Recovered           0
Active              0
Combined_Key        0
dtype: int64

## Removing Missing Data: dropna() → Deletes all rows containing missing values.

In [219]:
# Remove all rows containing missing values
df_drop = df.dropna()

# Print the size of the DataFrame before and after dropping missing values
print(f"Before drop: {df.size}")
print(f"After drop: {df_drop.size}")

# Print the number of removed rows
print(f"Number of deleted rows: {int(df.size) - int(df_drop.size)}")

Before drop: 30264
After drop: 26016
Number of deleted rows: 4248


## Deleting Only Rows with Missing Values in a Specific Column: dropna(subset=["Column_Name"])

In [220]:
# Remove rows where the "Confirmed" field has null values
myDf = df.dropna(subset=["Confirmed"])

# Display the shape (number of rows and columns) of the updated DataFrame
myDf.shape

(2522, 12)

## Filling Missing Data with a Specific Value: fillna(value)

In [221]:
# Load the dataset from a CSV file
df = pd.read_csv("D:/myAnalyze/PANDASPLOTLY_FUNCODING_FULLDATA_20240601/00_Material(Uploaded)/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv", encoding="utf-8-sig")

# Display the first 5 rows of the dataset
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,
4,Gansu,Mainland China,1/22/2020 17:00,,,


In [222]:
# Fill all missing values in the DataFrame with 0
fillDF_1 = df.fillna(0)

# Display the first 5 rows of the updated DataFrame
fillDF_1.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [223]:
# Fill missing values with specific values for each column using a dictionary
nan_data = {"Deaths": 0, "Recovered": 0}

# Apply the fillna() method with the specified dictionary
fillDF_2 = df.fillna(nan_data)

# Display the first 5 rows of the updated DataFrame
fillDF_2.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,Gansu,Mainland China,1/22/2020 17:00,,0.0,0.0


## Merging Data Based on a Specific Key
- groupby() : Similar to SQL's GROUP BY (column_name), groups data based on a specific column → The specified column becomes the index.
- sum() : Aggregates grouped data by summing values.

In [224]:
# Load the dataset from a CSV file
df = pd.read_csv("D:/myAnalyze/PANDASPLOTLY_FUNCODING_FULLDATA_20240601/00_Material(Uploaded)/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv", encoding="utf-8-sig")

# Display the first 5 rows of the dataset
df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,4,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,46,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,7,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,192,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,1,"Adair, Iowa, US"


In [225]:
# Group data by "Country_Region" and sum the values for each country
df_sum = df.groupby("Country_Region").sum()

# Display the first 5 rows of the aggregated DataFrame
df_sum.head()

Unnamed: 0_level_0,FIPS,Admin2,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,0.0,0,0,2020-04-01 21:58:34,33.93911,67.709953,192,4,5,183,Afghanistan
Albania,0.0,0,0,2020-04-01 21:58:34,41.1533,20.1683,259,15,67,177,Albania
Algeria,0.0,0,0,2020-04-01 21:58:34,28.0339,1.6596,847,58,61,728,Algeria
Andorra,0.0,0,0,2020-04-01 21:58:34,42.5063,1.5218,390,14,10,366,Andorra
Angola,0.0,0,0,2020-04-01 21:58:34,-11.2027,17.8739,8,2,1,5,Angola


In [226]:
# Display the column names of the grouped DataFrame
df_sum.columns

Index(['FIPS', 'Admin2', 'Province_State', 'Last_Update', 'Lat', 'Long_',
       'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key'],
      dtype='object')

In [227]:
# Display the index of the grouped DataFrame
df_sum.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antarctica',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Venezuela', 'Vietnam', 'West Bank and Gaza', 'Winter Olympics 2022',
       'Zambia', 'Zimbabwe'],
      dtype='object', name='Country_Region', length=190)

In [228]:
# 미국 집계 조회
df_sum[df_sum.index == "US"]

Unnamed: 0_level_0,FIPS,Admin2,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
US,65168934.0,AbbevilleAcadiaAccomackAdaAdairAdairAdairAdams...,South CarolinaLouisianaVirginiaIdahoIowaMissou...,2020-04-01 21:58:492020-04-01 21:58:492020-04-...,82956.96013,-197553.963757,216936,5222,8474,209150,"Abbeville, South Carolina, USAcadia, Louisiana..."


In [229]:
# Retrieve the aggregated data for the United States
df_sum[df_sum.index == "US"]

Unnamed: 0_level_0,FIPS,Admin2,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
US,65168934.0,AbbevilleAcadiaAccomackAdaAdairAdairAdairAdams...,South CarolinaLouisianaVirginiaIdahoIowaMissou...,2020-04-01 21:58:492020-04-01 21:58:492020-04-...,82956.96013,-197553.963757,216936,5222,8474,209150,"Abbeville, South Carolina, USAcadia, Louisiana..."


## Changing Column Data Types
- In pandas, data types are referred to as dtype, and the main data types include:
- object: Represents string data (Python's str) or mixed data types.
- int64: Represents integer values.
- float64: Represents floating-point (decimal) values.
- bool: Represents boolean values (True/False).
- Changing Data Type for a Series

## For a Series (Column_Name: New_Type)
- Changes the data type of a specific column.
- If the data contains missing values (NaN), an error may occur.
- For a Series, use Series.astype(new_type).

## For a DataFrame
- To rename column names, use .columns.
- Example: .columns = [New_Column1, New_Column2].

In [230]:
# Load the dataset from a CSV file
df = pd.read_csv("D:/myAnalyze/PANDASPLOTLY_FUNCODING_FULLDATA_20240601/00_Material(Uploaded)/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv", encoding="utf-8-sig")

# Select only the "Country/Region" and "Confirmed" columns
new_df = df[["Country/Region", "Confirmed"]]

# Display information about the DataFrame, including column types and missing values
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country/Region  76 non-null     object 
 1   Confirmed       66 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.3+ KB


In [231]:
# Remove rows where the "Confirmed" column has missing values
new_df = new_df.dropna(subset=["Confirmed"])

# Convert the "Confirmed" column from float to int
new_df = new_df.astype({
    "Confirmed": "int64"
})

# Display updated DataFrame information
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66 entries, 0 to 75
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country/Region  66 non-null     object
 1   Confirmed       66 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ KB


In [232]:
# Display the first 5 rows of the updated DataFrame
new_df.head()

Unnamed: 0,Country/Region,Confirmed
0,Mainland China,1
1,Mainland China,14
2,Mainland China,6
3,Mainland China,1
5,Mainland China,26


In [233]:
# Rename the columns in the DataFrame
new_df.columns = ["Country_Region", "CONFIRMED"]

# Display the first 5 rows of the updated DataFrame
new_df.head()

Unnamed: 0,Country_Region,CONFIRMED
0,Mainland China,1
1,Mainland China,14
2,Mainland China,6
3,Mainland China,1
5,Mainland China,26


## Checking and Removing Duplicate Rows in a DataFrame
- duplicated() : Checks for duplicate rows.
- drop_duplicates() : Removes duplicate rows.
- Removing duplicates based on a specific column → subset = ["Column_Name"].
- Specify which row to keep when duplicates exist:
- First occurrence (default) → keep="first".
- Last occurrence → keep="last".

In [234]:
# Load the dataset from a CSV file
df = pd.read_csv("D:/myAnalyze/PANDASPLOTLY_FUNCODING_FULLDATA_20240601/00_Material(Uploaded)/COVID-19-master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv", encoding="utf-8-sig")

# Select only the "iso2" and "Country_Region" columns
df = df[["iso2", "Country_Region"]]

# Display information about the DataFrame, including column types and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   iso2            3557 non-null   object
 1   Country_Region  3560 non-null   object
dtypes: object(2)
memory usage: 55.8+ KB


In [235]:
# Display the entire DataFrame
df

Unnamed: 0,iso2,Country_Region
0,BW,Botswana
1,BI,Burundi
2,SL,Sierra Leone
3,AF,Afghanistan
4,AL,Albania
...,...,...
3555,US,US
3556,US,US
3557,US,US
3558,US,US


In [236]:
# Check for duplicate rows in the DataFrame
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3555     True
3556     True
3557     True
3558     True
3559     True
Length: 3560, dtype: bool

In [237]:
# Display only the duplicate rows in the DataFrame
df[df.duplicated()]

Unnamed: 0,iso2,Country_Region
194,GB,United Kingdom
200,AU,Australia
201,AU,Australia
202,AU,Australia
203,AU,Australia
...,...,...
3555,US,US
3556,US,US
3557,US,US
3558,US,US


In [238]:
# Remove duplicate rows based on the "Country_Region" column
# Keep the first occurrence and drop the rest
df_drop_first = df.drop_duplicates(subset="Country_Region", keep="first")

# Display the updated DataFrame
df_drop_first

Unnamed: 0,iso2,Country_Region
0,BW,Botswana
1,BI,Burundi
2,SL,Sierra Leone
3,AF,Afghanistan
4,AL,Albania
...,...,...
175,ZW,Zimbabwe
199,AU,Australia
207,CA,Canada
222,CN,China


In [239]:
# Remove duplicate rows based on the "Country_Region" column
# Keep the last occurrence and drop the rest
df_drop_last = df.drop_duplicates(subset="Country_Region", keep="last")

# Display the updated DataFrame
df_drop_last

Unnamed: 0,iso2,Country_Region
0,BW,Botswana
1,BI,Burundi
2,SL,Sierra Leone
3,AF,Afghanistan
4,AL,Albania
...,...,...
198,TC,United Kingdom
206,AU,Australia
221,CA,Canada
254,CN,China


## Merging DataFrames with Pandas
- concat() : Combines two DataFrames into a single DataFrame.
- Simply connects the two DataFrames vertically (top-bottom) or horizontally (left-right).
- Syntax: pd.concat([DataFrame1, DataFrame2]).
- axis=0 : Merges from top to bottom (default).
- axis=1 : Merges from left to right.

In [240]:
# Import the Pandas library
import pandas as pd

In [241]:
# Create the first DataFrame (customer information)
a_df = pd.DataFrame({
    "id": [1, 2, 3],
    "customer_id": [1, 2, 3],
    "customer_name": ["Robert", "Peter", "Dave"]
})

# Create the second DataFrame (order information)
b_df = pd.DataFrame({
    "id": [1, 2, 4],
    "order_id": [100, 200, 300],
    "order_date": ["2021-01-21", "2021-02-03", "2020-10-01"]
})

In [242]:
a_df

Unnamed: 0,id,customer_id,customer_name
0,1,1,Robert
1,2,2,Peter
2,3,3,Dave


In [243]:
b_df

Unnamed: 0,id,order_id,order_date
0,1,100,2021-01-21
1,2,200,2021-02-03
2,4,300,2020-10-01


In [244]:
# Concatenate the two DataFrames vertically (default: axis=0)
pd.concat([a_df, b_df])

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1.0,Robert,,
1,2,2.0,Peter,,
2,3,3.0,Dave,,
0,1,,,100.0,2021-01-21
1,2,,,200.0,2021-02-03
2,4,,,300.0,2020-10-01


In [245]:
# Concatenate the two DataFrames horizontally (axis=1) to merge columns side by side
pd.concat([a_df, b_df], axis=1)

Unnamed: 0,id,customer_id,customer_name,id.1,order_id,order_date
0,1,1,Robert,1,100,2021-01-21
1,2,2,Peter,2,200,2021-02-03
2,3,3,Dave,4,300,2020-10-01


## Merging Two DataFrames
- merge(df1, df2)
- Merges two DataFrames based on a common column name.
- Use the on parameter to specify the column to merge on → merge(df1, df2, on="column_name").
- how parameter determines the type of join.

## Types of Joins (how parameter)
- inner (Inner Join)
- Similar to SQL Inner Join.
- ncludes only matching rows from both DataFrames based on the on column.
- Non-matching rows are discarded.
![image.png](attachment:image.png)
- (2)outer (Full Outer Join)
- Similar to SQL Outer Join.
- Includes all rows from both DataFrames.
- If there is no match, missing values are filled with NaN.
![image-2.png](attachment:image-2.png)
- (3)left (Left Outer Join)
- Similar to SQL Left Outer Join.
- Includes all rows from the left DataFrame and matching rows from the right DataFrame.
- Non-matching right-side rows are filled with NaN.
![image-3.png](attachment:image-3.png)
- (4)right (Right Outer Join)
- Similar to SQL Right Outer Join.
- Includes all rows from the right DataFrame and matching rows from the left DataFrame.
- Non-matching left-side rows are filled with NaN.
![image-4.png](attachment:image-4.png)

## Merging Based on Index Instead of a Column:
- Use merge(df1, df2, left_index=True, right_index=True) to merge DataFrames using their index values as the key.
- This allows merging without specifying a column explicitly.

In [246]:
# Merge two DataFrames using the default "inner" join (same as pd.merge(a_df, b_df, how="inner"))
pd.merge(a_df, b_df)

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1,Robert,100,2021-01-21
1,2,2,Peter,200,2021-02-03


In [247]:
# Merge two DataFrames using the "inner" join explicitly (default behavior)
pd.merge(a_df, b_df, how="inner")

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1,Robert,100,2021-01-21
1,2,2,Peter,200,2021-02-03


In [248]:
# Merge two DataFrames using "id" as the common key column with the default "inner" join
pd.merge(a_df, b_df, on="id")

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1,Robert,100,2021-01-21
1,2,2,Peter,200,2021-02-03


In [249]:
# Perform an outer join on the "id" column to include all rows from both DataFrames
pd.merge(a_df, b_df, on="id", how="outer")

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1.0,Robert,100.0,2021-01-21
1,2,2.0,Peter,200.0,2021-02-03
2,3,3.0,Dave,,
3,4,,,300.0,2020-10-01


In [250]:
# Perform a left join on the "id" column, keeping all rows from the left DataFrame (a_df)
pd.merge(a_df, b_df, on="id", how="left")

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1,Robert,100.0,2021-01-21
1,2,2,Peter,200.0,2021-02-03
2,3,3,Dave,,


In [251]:
# Perform a right join on the "id" column, keeping all rows from the right DataFrame (b_df)
pd.merge(a_df, b_df, on="id", how="right")

Unnamed: 0,id,customer_id,customer_name,order_id,order_date
0,1,1.0,Robert,100,2021-01-21
1,2,2.0,Peter,200,2021-02-03
2,4,,,300,2020-10-01


In [252]:
# Set the "id" column as the index in the DataFrame
a_df = a_df.set_index("id")

# Display the updated DataFrame with "id" as the index
a_df

Unnamed: 0_level_0,customer_id,customer_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,Robert
2,2,Peter
3,3,Dave


In [253]:
# Set the "id" column as the index in the DataFrame
b_df = b_df.set_index("id")

# Display the updated DataFrame with "id" as the index
b_df

Unnamed: 0_level_0,order_id,order_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,100,2021-01-21
2,200,2021-02-03
4,300,2020-10-01


In [254]:
# Perform an inner join using the index as the key instead of a column
pd.merge(a_df, b_df, how="inner", left_index=True, right_index=True)

Unnamed: 0_level_0,customer_id,customer_name,order_id,order_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,Robert,100,2021-01-21
2,2,Peter,200,2021-02-03


In [255]:
# Perform an outer join using the index as the key instead of a column
pd.merge(a_df, b_df, how="outer", left_index=True, right_index=True)

Unnamed: 0_level_0,customer_id,customer_name,order_id,order_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,Robert,100.0,2021-01-21
2,2.0,Peter,200.0,2021-02-03
3,3.0,Dave,,
4,,,300.0,2020-10-01
