#### Data Manipulation

In [50]:
import pandas as pd
import numpy as np

###### 1. Fundamentals: Series and DataFrame

In [5]:
# Build a one-dimensional Series; the np.nan entry stands in for a missing
# value and makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(f"Series:\n {s}")

Series:
 0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [9]:
# Build a DataFrame from a mapping of column name -> list of values
# (one integer, one string and one boolean column).
data = dict(
    column1=[1, 2, 3, 4],
    column2=['A', 'B', 'C', 'D'],
    column3=[True, False, True, False],
)
df = pd.DataFrame(data=data)
print(f"\nDataFrame:\n {df}")


DataFrame:
    column1 column2  column3
0        1       A     True
1        2       B    False
2        3       C     True
3        4       D    False


###### 2. Basic Data Selection and Indexing

In [19]:
# Pick one column by label; the result is a Series named after the column
print("\nSelect 'column1': \n", df.loc[:, 'column1'])


Select 'column1': 
 0    1
1    2
2    3
3    4
Name: column1, dtype: int64


In [None]:
# Select multiple columns: indexing with a *list* of labels returns a
# DataFrame containing just those columns, whereas a single label
# (e.g. df['column1']) returns a Series.
print("\nSelect 'column1' and 'column2': \n", df[['column1', 'column2']])

*******************************************

In [12]:
# Sample employee records used by the examples that follow.
# Note that the name 'Alice' appears twice and one Rating is missing.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Alice'],
    Age=[24, 27, 22, 32, 29, 25, 24],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', 'New York', 'Chicago'],
    Salary=[70000, 85000, 60000, 120000, 95000, 72000, 62000],
    Department=['HR', 'IT', 'Finance', 'IT', 'HR', 'Finance', 'HR'],
    Experience=[2, 5, 1, 10, 6, 3, 2],
    Rating=[4.5, 3.8, np.nan, 4.9, 4.2, 3.5, 4.0],  # np.nan marks the missing value
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")
print(f"\n{'=' * 50}\n")

Original DataFrame:
      Name  Age         City  Salary Department  Experience  Rating
0    Alice   24     New York   70000         HR           2     4.5
1      Bob   27  Los Angeles   85000         IT           5     3.8
2  Charlie   22      Chicago   60000    Finance           1     NaN
3    David   32      Houston  120000         IT          10     4.9
4      Eve   29        Miami   95000         HR           6     4.2
5    Frank   25     New York   72000    Finance           3     3.5
6    Alice   24      Chicago   62000         HR           2     4.0




In [14]:
# Show the whole DataFrame as this cell's output (bare last expression)
df

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Rating
0,Alice,24,New York,70000,HR,2,4.5
1,Bob,27,Los Angeles,85000,IT,5,3.8
2,Charlie,22,Chicago,60000,Finance,1,
3,David,32,Houston,120000,IT,10,4.9
4,Eve,29,Miami,95000,HR,6,4.2
5,Frank,25,New York,72000,Finance,3,3.5
6,Alice,24,Chicago,62000,HR,2,4.0


##### 1. Conditional Selection (Filtering Rows): This is one of the most common and powerful techniques.

In [25]:
# Boolean-mask filtering: keep only the rows whose Age exceeds 25
print("Employees older than 25:")
print(df.loc[df['Age'].gt(25)])
print(f"\n{'-' * 30}\n")

Employees older than 25:
    Name  Age         City  Salary Department  Experience  Rating
1    Bob   27  Los Angeles   85000         IT           5     3.8
3  David   32      Houston  120000         IT          10     4.9
4    Eve   29        Miami   95000         HR           6     4.2

------------------------------



In [18]:
# Combine several row conditions element-wise (& is AND, | is OR);
# each comparison yields its own boolean mask before they are combined.
print("Employees from New York with Salary > 70000:")
print(df.loc[df['City'].eq('New York') & df['Salary'].gt(70000)])
print(f"\n{'-' * 30}\n")

Employees from New York with Salary > 70000:
    Name  Age      City  Salary Department  Experience  Rating
5  Frank   25  New York   72000    Finance           3     3.5

------------------------------



In [22]:
# .isin() tests a column against a list of accepted values in one step
print("Employees from New York or Los Angeles:")
print(df.loc[df['City'].isin(['New York', 'Los Angeles'])])
print(f"\n{'=' * 50}\n")

Employees from New York or Los Angeles:
    Name  Age         City  Salary Department  Experience  Rating
0  Alice   24     New York   70000         HR           2     4.5
1    Bob   27  Los Angeles   85000         IT           5     3.8
5  Frank   25     New York   72000    Finance           3     3.5




##### 2. Grouping and Aggregating Data


In [36]:
# Split the rows by 'Department', then average the 'Salary' of each group
print("Average Salary by Department:")
print(df.groupby(by='Department')['Salary'].agg('mean'))
print(f"\n{'-' * 30}\n")

Average Salary by Department:
Department
Finance     66000.000000
HR          75666.666667
IT         102500.000000
Name: Salary, dtype: float64

------------------------------



In [46]:
# Group on two keys at once and compute several named aggregations;
# each keyword becomes an output column built from (source column, function).
print("Aggregations by Department and City:")
print(
    df.groupby(['Department', 'City']).agg(
        Avg_Salary=pd.NamedAgg(column='Salary', aggfunc='mean'),
        Min_Age=pd.NamedAgg(column='Age', aggfunc='min'),
        Max_Experience=pd.NamedAgg(column='Experience', aggfunc='max'),
        Count=pd.NamedAgg(column='Name', aggfunc='count'),
    )
)
print(f"\n{'-' * 30}\n")

Aggregations by Department and City:
                        Avg_Salary  Min_Age  Max_Experience  Count
Department City                                                   
Finance    Chicago         60000.0       22               1      1
           New York        72000.0       25               3      1
HR         Chicago         62000.0       24               2      1
           Miami           95000.0       29               6      1
           New York        70000.0       24               2      1
IT         Houston        120000.0       32              10      1
           Los Angeles     85000.0       27               5      1

------------------------------



In [48]:
# Passing a list of function names yields one output column per statistic
print("Salary statistics by Department:")
print(
    df.groupby(by='Department')['Salary'].aggregate(
        ['mean', 'median', 'min', 'max', 'std']
    )
)
print(f"\n{'=' * 50}\n")

Salary statistics by Department:
                     mean    median    min     max           std
Department                                                      
Finance      66000.000000   66000.0  60000   72000   8485.281374
HR           75666.666667   70000.0  62000   95000  17214.335112
IT          102500.000000  102500.0  85000  120000  24748.737342




##### 3. Merging/Joining (SQL-like Joins)

In [53]:
# Lookup table with one row per department (its head and budget);
# it shares the 'Department' column with df.
dep_info = pd.DataFrame(
    data=dict(
        Department=['HR', 'IT', 'Finance'],
        Head=['John Doe', 'Jane Smith', 'Peter Jones'],
        Budget=[500000, 800000, 600000],
    )
)
print("Original DataFrame:", df.loc[:, ['Name', 'Department', 'Salary']], sep="\n")
print("\nDepartment Info DataFrame:", dep_info, sep="\n")
print(f"\n{'-' * 30}\n")

Original DataFrame:
      Name Department  Salary
0    Alice         HR   70000
1      Bob         IT   85000
2  Charlie    Finance   60000
3    David         IT  120000
4      Eve         HR   95000
5    Frank    Finance   72000
6    Alice         HR   62000

Department Info DataFrame:
  Department         Head  Budget
0         HR     John Doe  500000
1         IT   Jane Smith  800000
2    Finance  Peter Jones  600000

------------------------------



In [57]:
# Inner merge (pandas' default `how`): keep only rows whose 'Department'
# value is present in both df and dep_info.
merged_df_inner = df.merge(dep_info, on='Department', how='inner')
print("Inner Merged DataFrame:", merged_df_inner, sep="\n")
print(f"\n{'-' * 30}\n")


Inner Merged DataFrame:
      Name  Age         City  Salary Department  Experience  Rating  \
0    Alice   24     New York   70000         HR           2     4.5   
1      Bob   27  Los Angeles   85000         IT           5     3.8   
2  Charlie   22      Chicago   60000    Finance           1     NaN   
3    David   32      Houston  120000         IT          10     4.9   
4      Eve   29        Miami   95000         HR           6     4.2   
5    Frank   25     New York   72000    Finance           3     3.5   
6    Alice   24      Chicago   62000         HR           2     4.0   

          Head  Budget  
0     John Doe  500000  
1   Jane Smith  800000  
2  Peter Jones  600000  
3   Jane Smith  800000  
4     John Doe  500000  
5  Peter Jones  600000  
6     John Doe  500000  

------------------------------



In [59]:
# Show the inner-merge result as this cell's output (bare last expression)
merged_df_inner

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Rating,Head,Budget
0,Alice,24,New York,70000,HR,2,4.5,John Doe,500000
1,Bob,27,Los Angeles,85000,IT,5,3.8,Jane Smith,800000
2,Charlie,22,Chicago,60000,Finance,1,,Peter Jones,600000
3,David,32,Houston,120000,IT,10,4.9,Jane Smith,800000
4,Eve,29,Miami,95000,HR,6,4.2,John Doe,500000
5,Frank,25,New York,72000,Finance,3,3.5,Peter Jones,600000
6,Alice,24,Chicago,62000,HR,2,4.0,John Doe,500000


In [67]:
# Left merge: every row of the left frame (df) is kept; columns coming
# from dep_info are NaN wherever a department has no match.
merged_df_left = df.merge(dep_info, on='Department', how='left')
print("Left Merged DataFrame:", merged_df_left, sep="\n")
print(f"\n{'-' * 30}\n")

Left Merged DataFrame:
      Name  Age         City  Salary Department  Experience  Rating  \
0    Alice   24     New York   70000         HR           2     4.5   
1      Bob   27  Los Angeles   85000         IT           5     3.8   
2  Charlie   22      Chicago   60000    Finance           1     NaN   
3    David   32      Houston  120000         IT          10     4.9   
4      Eve   29        Miami   95000         HR           6     4.2   
5    Frank   25     New York   72000    Finance           3     3.5   
6    Alice   24      Chicago   62000         HR           2     4.0   

          Head  Budget  
0     John Doe  500000  
1   Jane Smith  800000  
2  Peter Jones  600000  
3   Jane Smith  800000  
4     John Doe  500000  
5  Peter Jones  600000  
6     John Doe  500000  

------------------------------



In [69]:
# Show the left-merge result as this cell's output (bare last expression)
merged_df_left

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Rating,Head,Budget
0,Alice,24,New York,70000,HR,2,4.5,John Doe,500000
1,Bob,27,Los Angeles,85000,IT,5,3.8,Jane Smith,800000
2,Charlie,22,Chicago,60000,Finance,1,,Peter Jones,600000
3,David,32,Houston,120000,IT,10,4.9,Jane Smith,800000
4,Eve,29,Miami,95000,HR,6,4.2,John Doe,500000
5,Frank,25,New York,72000,Finance,3,3.5,Peter Jones,600000
6,Alice,24,Chicago,62000,HR,2,4.0,John Doe,500000


In [71]:
# Right merge: every row of the right frame (dep_info) is kept; columns
# coming from df are NaN wherever a department has no matching employee.
merged_df_right = df.merge(dep_info, on='Department', how='right')
print("Right Merged DataFrame:", merged_df_right, sep="\n")
print(f"\n{'-' * 30}\n")

Right Merged DataFrame:
      Name  Age         City  Salary Department  Experience  Rating  \
0    Alice   24     New York   70000         HR           2     4.5   
1      Eve   29        Miami   95000         HR           6     4.2   
2    Alice   24      Chicago   62000         HR           2     4.0   
3      Bob   27  Los Angeles   85000         IT           5     3.8   
4    David   32      Houston  120000         IT          10     4.9   
5  Charlie   22      Chicago   60000    Finance           1     NaN   
6    Frank   25     New York   72000    Finance           3     3.5   

          Head  Budget  
0     John Doe  500000  
1     John Doe  500000  
2     John Doe  500000  
3   Jane Smith  800000  
4   Jane Smith  800000  
5  Peter Jones  600000  
6  Peter Jones  600000  

------------------------------



In [73]:
# Show the right-merge result as this cell's output (bare last expression)
merged_df_right

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Rating,Head,Budget
0,Alice,24,New York,70000,HR,2,4.5,John Doe,500000
1,Eve,29,Miami,95000,HR,6,4.2,John Doe,500000
2,Alice,24,Chicago,62000,HR,2,4.0,John Doe,500000
3,Bob,27,Los Angeles,85000,IT,5,3.8,Jane Smith,800000
4,David,32,Houston,120000,IT,10,4.9,Jane Smith,800000
5,Charlie,22,Chicago,60000,Finance,1,,Peter Jones,600000
6,Frank,25,New York,72000,Finance,3,3.5,Peter Jones,600000


In [77]:
# Outer merge: keep the rows of both frames; whichever side has no match
# for a 'Department' value is filled with NaN.
merged_df_outer = df.merge(dep_info, on='Department', how='outer')
print("Outer Merged DataFrame:", merged_df_outer, sep="\n")
print(f"\n{'=' * 50}\n")

Outer Merged DataFrame:
      Name  Age         City  Salary Department  Experience  Rating  \
0  Charlie   22      Chicago   60000    Finance           1     NaN   
1    Frank   25     New York   72000    Finance           3     3.5   
2    Alice   24     New York   70000         HR           2     4.5   
3      Eve   29        Miami   95000         HR           6     4.2   
4    Alice   24      Chicago   62000         HR           2     4.0   
5      Bob   27  Los Angeles   85000         IT           5     3.8   
6    David   32      Houston  120000         IT          10     4.9   

          Head  Budget  
0  Peter Jones  600000  
1  Peter Jones  600000  
2     John Doe  500000  
3     John Doe  500000  
4     John Doe  500000  
5   Jane Smith  800000  
6   Jane Smith  800000  




In [79]:
# Show the outer-merge result as this cell's output (bare last expression)
merged_df_outer

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Rating,Head,Budget
0,Charlie,22,Chicago,60000,Finance,1,,Peter Jones,600000
1,Frank,25,New York,72000,Finance,3,3.5,Peter Jones,600000
2,Alice,24,New York,70000,HR,2,4.5,John Doe,500000
3,Eve,29,Miami,95000,HR,6,4.2,John Doe,500000
4,Alice,24,Chicago,62000,HR,2,4.0,John Doe,500000
5,Bob,27,Los Angeles,85000,IT,5,3.8,Jane Smith,800000
6,David,32,Houston,120000,IT,10,4.9,Jane Smith,800000
