In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a DataFrame from NumPy arrays
# Keep one array per column. A single 2-D array that mixes numbers and strings
# is coerced by NumPy to an all-string array, which would leave 'Age' as text
# and break numeric operations such as comparisons and mean().
ages = np.array([25, 30, 22])
names = np.array(['John', 'Alice', 'Bob'])
data = {'Age': ages, 'Name': names}
df = pd.DataFrame(data)

In [3]:
# 1. Selecting Data:
#      Columns, rows, or individual values can be selected in several ways.

# Column selection: asking for a single column label returns a Series.
print(df.loc[:, 'Name'])


0     John
1    Alice
2      Bob
Name: Name, dtype: object


In [4]:
#   Row selection by index
# .loc looks the row up by its label; .iloc looks it up by its position.
# With the default 0..n-1 index both calls return the same row.
print(df.loc[1, :])   # row whose label is 1
print(df.iloc[1, :])  # row at position 1 (the second row)


Age        30
Name    Alice
Name: 1, dtype: object
Age        30
Name    Alice
Name: 1, dtype: object


1. .loc - Label-Based Indexing

In [None]:
# 1. .loc - Label-Based Indexing
# .loc addresses data by label: index labels for rows, column names for columns.
# A single label, a list of labels, a slice of labels or a boolean array are all accepted.

data = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[24, 27, 22],
    City=["New York", "Los Angeles", "Chicago"],
)
df = pd.DataFrame(data, index=list("abc"))

# One row, selected by its label (returned as a Series)
row_b = df.loc["b", :]

# One cell, selected by row label and column label
age_bob = df.loc["b", "Age"]

# Several rows and columns at once, each given as a list of labels
subset = df.loc[["a", "c"], ["Name", "City"]]

print(row_b, age_bob, subset, sep="\n")


2. .iloc - Integer-Based Indexing

In [None]:
# 2. .iloc - Integer-Based Indexing
# .iloc addresses data by zero-based integer position (row position, column position).
# It takes positions rather than labels, which helps when the index carries no meaningful labels.
# As with .loc, single positions, slices and lists of positions are accepted.

# Row at position 1 (the second row)
second_row = df.iloc[1, :]

# Cell at row position 1, column position 1
age_second_row = df.iloc[1, 1]

# Rows at positions 0 and 2, restricted to columns at positions 0 and 2
subset = df.iloc[[0, 2], [0, 2]]

print(second_row, age_second_row, subset, sep="\n")


3. .at - Fast Label-Based Scalar Access

In [None]:
# 3. .at - Fast Label-Based Scalar Access
# .at is designed for fast access to a single scalar value (one cell) based on row and column labels.
# It is more efficient than .loc when accessing a single element because it is optimized for scalar access.
# It takes exactly one row label and one column label; use .loc for slices or lists.

# Accessing a single element by label (faster than .loc for one cell).
# Relies on `df` carrying the row label "b" and an "Age" column.
age_bob = df.at["b", "Age"]
print(age_bob)


4. .iat - Fast Integer-Based Scalar Access

In [None]:
# 4. .iat - Fast Integer-Based Scalar Access
# .iat is similar to .at, but it uses integer-based positions to quickly access a single scalar value.
# Like .iloc, positions are zero-based, so [1, 1] means second row, second column.

# Accessing a single element by integer position (faster than .iloc for one cell)
age_second_row = df.iat[1, 1]
print(age_second_row)


In [None]:
#     Selecting Specific values
# Take the label of the second row from the index instead of hard-coding 1:
# once the frame is indexed by "a"/"b"/"c", df.loc[1, 'Age'] raises KeyError.
second_label = df.index[1]
print(df.loc[second_label, 'Age'])  # Select value from the second row and column 'Age'


In [None]:
# 2. Filtering:
#       Rows can be kept or discarded with a boolean condition.

# Build a boolean mask (Age strictly greater than 23) and keep the matching rows
filtered_df = df.loc[df['Age'].gt(23)]
print(filtered_df)


In [None]:
# 3. Adding or Removing Data:
#      Adding a new column: assign a list with one value per row to a new column name.

country_by_row = ['USA', 'France', 'UK']
df['Country'] = country_by_row
print(df)


In [None]:
        # Removing a Column
# Drops the 'City' column from df in place; running this again once the column
# is gone raises KeyError.
df.drop(columns='City', inplace=True)


In [None]:
# Adding a Row
# DataFrame.append was removed in pandas 2.0, so wrap the new record in a
# one-row DataFrame and concatenate it. ignore_index=True renumbers the rows
# 0..n-1. Columns present on only one side are filled with NaN on the other.

new_row = {'Name': 'Eve', 'Age': 28, 'City': 'Berlin'}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
print(df)


In [None]:
# Removing a Row
# drop() works on labels: this removes the row whose index label is 0, in place.

df.drop(index=0, inplace=True)


In [5]:
# 4. Handling Missing Data:
#       pandas marks missing values as NaN and ships helpers to find, fill and drop them.

#     Checking for missing data: a same-shaped frame of booleans, True where a value is missing
print(df.isna())


     Age   Name
0  False  False
1  False  False
2  False  False


In [None]:
# Filling Missing Data:
# Assign the filled column back to df. Calling fillna(inplace=True) on the
# df['Age'] selection acts on an intermediate object: pandas 2.x warns about it,
# and with Copy-on-Write it no longer updates df at all.

df['Age'] = df['Age'].fillna(df['Age'].mean())  # Fills missing 'Age' with the mean


In [None]:
# Dropping Rows or Columns with Missing Data:
# axis=0 drops rows (axis=1 would drop columns); how='any' drops a row as soon
# as a single value in it is missing.

df.dropna(axis=0, how='any', inplace=True)

In [None]:
# 5. Aggregating and Summarizing Data:
#        DataFrames make it easy to compute summaries and perform group-based operations:

# Summary Statistics:
# For numeric columns describe() reports count, mean, std, min, quartiles and max.

print(df.describe())  # Generates summary statistics for numeric columns


In [None]:
# Grouping and Aggregating:
# Split the rows by 'City', then reduce the 'Age' values of each group to their mean.

grouped = df.groupby('City')['Age'].agg('mean')
print(grouped)

In [None]:
# 6. Merging, Joining, and Concatenating DataFrames:
#          merge() combines two frames on a key column, much like a SQL join.

df1 = pd.DataFrame(dict(ID=[1, 2], Name=['Alice', 'Bob']))
df2 = pd.DataFrame(dict(ID=[1, 3], Score=[85, 90]))
# Inner join: only IDs present in both frames survive
merged_df = df1.merge(df2, how='inner', on='ID')
print(merged_df)

In [None]:
#   Concatenating DataFrames:
# Stacks the rows of df2 under those of df1. Each frame keeps its own index
# labels, and columns missing from one frame are filled with NaN.

concatenated_df = pd.concat([df1, df2], axis='index')
print(concatenated_df)

# Example of a DataFrame in Action

In [11]:
# Sample DataFrame with student information: one row per student, one column per subject
data = dict(
    Names=['John', 'Alice', 'Bob', 'Eve'],
    Math=[85, 95, 78, 92],
    Science=[88, 79, 92, 94],
    English=[91, 83, 89, 87],
)

stu_marks = pd.DataFrame(data)
# Label the rows 1..4 instead of the default 0..3
stu_marks.index = np.arange(1, 5)

print(stu_marks)


   Names  Math  Science  English
1   John    85       88       91
2  Alice    95       79       83
3    Bob    78       92       89
4    Eve    92       94       87


In [13]:
# Per-student average: mean across the three subject columns of each row, rounded to 2 decimals
stu_marks['Average'] = (
    stu_marks.loc[:, ['Math', 'Science', 'English']]
    .mean(axis='columns')
    .round(2)
)
print(stu_marks)

   Names  Math  Science  English  Average
1   John    85       88       91    88.00
2  Alice    95       79       83    85.67
3    Bob    78       92       89    86.33
4    Eve    92       94       87    91.00


In [None]:
# Keep only the students whose average score is strictly above 90
high_achievers = stu_marks.loc[stu_marks['Average'].gt(90)]
print(high_achievers)