In [3]:
# pandas: a data manipulation and analysis library built on top of NumPy,
# offering data structures such as DataFrame for handling structured data.
# Use cases: data cleaning, transformation, and analysis in data science and
# machine learning workflows.
import pandas as pd

# Round-trip a CSV file: load "data.csv" into a DataFrame, then save it to
# "output.csv" without writing the row index as an extra column.
df_file = pd.read_csv(filepath_or_buffer="data.csv")
df_file.to_csv(path_or_buf="output.csv", index=False)

# Walkthrough of common pandas DataFrame operations on two small in-memory
# frames. The commented tables under each statement show the expected result;
# exact spacing and the reported memory usage can vary between pandas versions.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "Charlie"],
        "Age": [25, 30, 35],
        "City": ["New York", "Los Angeles", "Chicago"],
    }
)
df2 = pd.DataFrame(
    {"Name": ["David", "Eva"], "Age": [28, 22], "City": ["Miami", "Seattle"]}
)

print(df.head())  # Display the first rows of the DataFrame (up to 5 by default)
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago

print(df.tail())  # Display the last rows of the DataFrame (up to 5 by default)
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago

# info() prints the summary itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()  # Print a summary of the DataFrame
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3 entries, 0 to 2
# Data columns (total 3 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   Name    3 non-null      object
#  1   Age     3 non-null      int64
#  2   City    3 non-null      object
# dtypes: int64(1), object(2)
# memory usage: 204.0+ bytes

print(df.describe())  # Get descriptive statistics for numerical columns
#         Age
# count   3.0
# mean   30.0
# std     5.0
# min    25.0
# 25%    27.5
# 50%    30.0
# 75%    32.5
# max    35.0

print(df.shape)  # Get the dimensions of the DataFrame as (rows, columns)
# (3, 3)

print(df.dtypes)  # Get the data type of each column
# Name    object
# Age      int64
# City    object
# dtype: object

print(df["Name"].unique())  # Get unique values in the 'Name' column
# ['Alice' 'Bob' 'Charlie']

print(df.isnull())  # Element-wise mask: True wherever a cell is missing
#     Name    Age   City
# 0  False  False  False
# 1  False  False  False
# 2  False  False  False

# Nothing is missing in this frame (see the all-False mask above), so the two
# clean-up calls below leave the data unchanged. With inplace=True they modify
# df directly and return None.
df.fillna(0, inplace=True)  # Fill missing values with 0
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago

df.dropna(inplace=True)  # Drop rows with any missing values
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago

print(df.iloc[0])  # Access the first row by integer position
# Name       Alice
# Age           25
# City    New York
# Name: 0, dtype: object

# The rows are already in ascending 'Age' order, so the order does not change.
df.sort_values("Age", inplace=True)  # Sort DataFrame by 'Age' column
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago

print(
    df["City"].value_counts()
)  # Count occurrences of each unique value in 'City' column
# City
# New York       1
# Los Angeles    1
# Chicago        1
# Name: count, dtype: int64

# Computed before 'Age' is doubled below, so the means are the original ages.
grouped = df.groupby("City").mean(
    numeric_only=True
)  # Group by 'City' and calculate mean of numerical columns
#               Age
# City
# Chicago      35.0
# Los Angeles  30.0
# New York     25.0

df["Age"] = df["Age"].apply(
    lambda x: x * 2
)  # Apply a function to double the 'Age' values
#       Name  Age         City
# 0    Alice   50     New York
# 1      Bob   60  Los Angeles
# 2  Charlie   70      Chicago

# Both frames also share 'Age' and 'City', which are not join keys, so pandas
# keeps both copies with the default "_x" (left) / "_y" (right) suffixes.
# No name appears in both frames, so every row has NaN on one side, which
# turns the integer ages into floats.
merged_df = pd.merge(
    df, df2, on="Name", how="outer"
)  # Outer-merge two DataFrames on 'Name' column
#       Name  Age_x       City_x  Age_y   City_y
# 0    Alice   50.0     New York    NaN      NaN
# 1      Bob   60.0  Los Angeles    NaN      NaN
# 2  Charlie   70.0      Chicago    NaN      NaN
# 3    David    NaN          NaN   28.0    Miami
# 4      Eva    NaN          NaN   22.0  Seattle

# Rows are stacked as-is; each frame keeps its own index labels, hence the
# repeated 0 and 1.
concatenated_df = pd.concat([df, df2])  # Concatenate two DataFrames
#       Name  Age         City
# 0    Alice   50     New York
# 1      Bob   60  Los Angeles
# 2  Charlie   70      Chicago
# 0    David   28        Miami
# 1      Eva   22      Seattle

# The remaining steps modify df itself (three rows); merged_df and
# concatenated_df are separate objects and are not affected.
df.rename(
    columns={"Name": "Full Name"}, inplace=True
)  # Rename 'Name' column to 'Full Name'
#    Full Name  Age         City
# 0      Alice   50     New York
# 1        Bob   60  Los Angeles
# 2    Charlie   70      Chicago

df.drop("City", axis=1, inplace=True)  # Drop the 'City' column
#    Full Name  Age
# 0      Alice   50
# 1        Bob   60
# 2    Charlie   70


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0
(3, 3)
Name    object
Age      int64
City    object
dtype: object
['Alice' 'Bob' 'Charlie']
    Name    Age   City
0  False  False  False
1  False  False  False
2  False  False  False
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
City
New York       1
Los Angeles    1
Chicago        1
Name: 