In [2]:
# 1. Importing Required Libraries
import numpy as np
import pandas as pd

In [3]:

# 2. Build two equal-length 1-D integer arrays to demonstrate NumPy arithmetic
numbers_a = np.array([1, 2, 3, 4, 5])
numbers_b = np.array([10, 20, 30, 40, 50])

# Show the inputs
print("Numbers A:", numbers_a)
print("Numbers B:", numbers_b)

# Arithmetic operators on ndarrays work element by element; `@` applied to
# two 1-D arrays yields their inner (dot) product as a scalar.
sum_array = numbers_a + numbers_b
product_array = numbers_a * numbers_b
dot_result = numbers_a @ numbers_b

# Show the computed results
print("Element-wise Sum:", sum_array)
print("Element-wise Product:", product_array)
print("Dot Product:", dot_result)

# Shape and element type of each input array
print("Shape of Numbers A:", numbers_a.shape)
print("Shape of Numbers B:", numbers_b.shape)
print("Data Type of Numbers A:", numbers_a.dtype)
print("Data Type of Numbers B:", numbers_b.dtype)

# A few reductions, using the ndarray methods
print("Max in A:", numbers_a.max())
print("Min in B:", numbers_b.min())
print("Mean of A:", numbers_a.mean())


Numbers A: [1 2 3 4 5]
Numbers B: [10 20 30 40 50]
Element-wise Sum: [11 22 33 44 55]
Element-wise Product: [ 10  40  90 160 250]
Dot Product: 550
Shape of Numbers A: (5,)
Shape of Numbers B: (5,)
Data Type of Numbers A: int64
Data Type of Numbers B: int64
Max in A: 5
Min in B: 10
Mean of A: 3.0


In [5]:
# 3. Broadcasting demo: a (2, 3) matrix combined with a length-3 row vector
array_2d = np.array([[1, 2, 3], [4, 5, 6]])
row_vector = np.array([10, 20, 30])

# The vector's only axis lines up with the matrix's last axis, so NumPy
# reuses the vector for every row of the matrix when adding.
result = np.add(array_2d, row_vector)

# Show the operands and the broadcast sum
print("Original 2D Array:\n", array_2d)
print("Row Vector:", row_vector)
print("Result after Broadcasting (Addition):\n", result)

# The result takes the shape of the larger operand
print("Shape of 2D Array:", array_2d.shape)
print("Shape of Row Vector:", row_vector.shape)
print("Result Shape:", result.shape)


Original 2D Array:
 [[1 2 3]
 [4 5 6]]
Row Vector: [10 20 30]
Result after Broadcasting (Addition):
 [[11 22 33]
 [14 25 36]]
Shape of 2D Array: (2, 3)
Shape of Row Vector: (3,)
Result Shape: (2, 3)


In [None]:
# Sample employee records. Some fields are deliberately left as None so the
# resulting DataFrame contains missing values to work with.
employee_info = {
    "EmpID": [201, 202, 203, 204, 205, 206, 207],
    "FullName": ["Amit", "Bhavna", "Chetan", "Deepa", "Esha", None, "Farhan"],
    "Age": [26, 31, None, 44, 29, 36, 41],
    "Dept": ["Admin", "Tech", "Tech", "Finance", None, "Admin", "Tech"],
    "MonthlySalary": [52000, 63000, 57000, None, 49000, 51000, 62000],
    # Parsed up front so the column is datetime-typed rather than plain strings
    "JoinDate": pd.to_datetime([
        "2019-02-10", "2020-08-25", "2021-03-14",
        "2016-10-05", "2022-07-01", "2019-11-17", "2020-01-30"
    ])
}

# Build the DataFrame (one column per dictionary key)
df_emp = pd.DataFrame(employee_info)

# Display the DataFrame
print(" Sample Employee DataFrame with Missing Values:\n")
print(df_emp)

# Basic info about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n DataFrame Summary:")
df_emp.info()


🔍 Sample Employee DataFrame with Missing Values:

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
0    201     Amit  26.0    Admin        52000.0 2019-02-10
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
2    203   Chetan   NaN     Tech        57000.0 2021-03-14
3    204    Deepa  44.0  Finance            NaN 2016-10-05
4    205     Esha  29.0     None        49000.0 2022-07-01
5    206     None  36.0    Admin        51000.0 2019-11-17
6    207   Farhan  41.0     Tech        62000.0 2020-01-30

📊 DataFrame Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EmpID          7 non-null      int64         
 1   FullName       6 non-null      object        
 2   Age            6 non-null      float64       
 3   Dept           6 non-null      object        
 4   MonthlySalary  6 non-null      float64       

In [None]:
# Data Cleaning – Handling Missing Values

# Step 1: Remove rows where FullName or Dept is missing (important fields)
rows_with_required = df_emp.dropna(subset=["FullName", "Dept"])

# Step 2: Fill missing Age and MonthlySalary with the column-wise mean.
# The means are taken over the rows that survived step 1. A single fillna()
# with a per-column mapping returns a new frame instead of writing into the
# dropna() result through .loc, which may raise SettingWithCopyWarning on
# pandas versions without Copy-on-Write. Columns not named in the mapping
# are left untouched.
mean_fill = rows_with_required[["Age", "MonthlySalary"]].mean().to_dict()
df_filtered = rows_with_required.fillna(value=mean_fill)

# Display cleaned DataFrame
print("\n Cleaned Employee DataFrame (after handling missing values):\n")
print(df_filtered)



✅ Cleaned Employee DataFrame (after handling missing values):

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
0    201     Amit  26.0    Admin        52000.0 2019-02-10
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
2    203   Chetan  35.5     Tech        57000.0 2021-03-14
3    204    Deepa  44.0  Finance        58500.0 2016-10-05
6    207   Farhan  41.0     Tech        62000.0 2020-01-30


In [8]:
# Step: deliberately append a copy of the first row so that there is a
# duplicate to detect; ignore_index renumbers the rows 0..n-1.
first_row = df_filtered.head(1)
df_filtered = pd.concat([df_filtered, first_row], ignore_index=True)

print("\n DataFrame with a duplicate row added:\n")
print(df_filtered)

# Step: drop rows that repeat an earlier row in every column, keeping the
# first occurrence of each.
df_filtered = df_filtered.drop_duplicates(keep="first")

print("\n DataFrame after removing duplicate rows:\n")
print(df_filtered)



 DataFrame with a duplicate row added:

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
0    201     Amit  26.0    Admin        52000.0 2019-02-10
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
2    203   Chetan  35.5     Tech        57000.0 2021-03-14
3    204    Deepa  44.0  Finance        58500.0 2016-10-05
4    207   Farhan  41.0     Tech        62000.0 2020-01-30
5    201     Amit  26.0    Admin        52000.0 2019-02-10

 DataFrame after removing duplicate rows:

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
0    201     Amit  26.0    Admin        52000.0 2019-02-10
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
2    203   Chetan  35.5     Tech        57000.0 2021-03-14
3    204    Deepa  44.0  Finance        58500.0 2016-10-05
4    207   Farhan  41.0     Tech        62000.0 2020-01-30


In [9]:
# Advanced Indexing and Filtering

# 1. Keep only the rows whose Dept is exactly 'Tech'
tech_team = df_filtered.loc[df_filtered["Dept"].eq("Tech")]
print("\n Employees in Tech Department:\n")
print(tech_team)

# 2. Keep only the rows whose MonthlySalary is strictly above 55,000
high_salary_employees = df_filtered.loc[df_filtered["MonthlySalary"].gt(55000)]
print("\n Employees with MonthlySalary > 55,000:\n")
print(high_salary_employees)



 Employees in Tech Department:

   EmpID FullName   Age  Dept  MonthlySalary   JoinDate
1    202   Bhavna  31.0  Tech        63000.0 2020-08-25
2    203   Chetan  35.5  Tech        57000.0 2021-03-14
4    207   Farhan  41.0  Tech        62000.0 2020-01-30

 Employees with MonthlySalary > 55,000:

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
2    203   Chetan  35.5     Tech        57000.0 2021-03-14
3    204    Deepa  44.0  Finance        58500.0 2016-10-05
4    207   Farhan  41.0     Tech        62000.0 2020-01-30


In [10]:
# Sorting and Selecting Specific Columns

# 1. Order the rows from highest to lowest MonthlySalary
df_sorted = df_filtered.sort_values("MonthlySalary", ascending=False)
print("\n DataFrame sorted by MonthlySalary (High to Low):\n")
print(df_sorted)

# 2. Project the sorted frame down to the name and salary columns
salary_columns = ["FullName", "MonthlySalary"]
salary_info = df_sorted.loc[:, salary_columns]
print("\n Selected Columns (FullName & MonthlySalary):\n")
print(salary_info)



 DataFrame sorted by MonthlySalary (High to Low):

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25
4    207   Farhan  41.0     Tech        62000.0 2020-01-30
3    204    Deepa  44.0  Finance        58500.0 2016-10-05
2    203   Chetan  35.5     Tech        57000.0 2021-03-14
0    201     Amit  26.0    Admin        52000.0 2019-02-10

 Selected Columns (FullName & MonthlySalary):

  FullName  MonthlySalary
1   Bhavna        63000.0
4   Farhan        62000.0
3    Deepa        58500.0
2   Chetan        57000.0
0     Amit        52000.0


In [11]:
# Grouping Data for Aggregation

# 1. Mean MonthlySalary per Dept, with Dept moved back into a regular column
salary_mean_per_dept = df_filtered.groupby("Dept")["MonthlySalary"].mean()
avg_salary_by_dept = salary_mean_per_dept.reset_index()
print("\n Average Monthly Salary by Department:\n")
print(avg_salary_by_dept)

# 2. Head count and mean age per department. Named aggregation sets the
#    output column names directly: each keyword is a result column built
#    from a (source column, aggregation) pair.
dept_summary = df_filtered.groupby("Dept").agg(
    EmployeeCount=("EmpID", "count"),
    AverageAge=("Age", "mean"),
)

print("\n Department-wise Employee Count and Average Age:\n")
print(dept_summary)



 Average Monthly Salary by Department:

      Dept  MonthlySalary
0    Admin   52000.000000
1  Finance   58500.000000
2     Tech   60666.666667

 Department-wise Employee Count and Average Age:

         EmployeeCount  AverageAge
Dept                              
Admin                1   26.000000
Finance              1   44.000000
Tech                 3   35.833333


In [12]:
# Define a function to categorize employees by age group
def get_age_group(age):
    """
    Categorize a single age value into an age group.

    Parameters
    ----------
    age : int or float or None
        Age in years. May be missing (None / NaN).

    Returns
    -------
    str or None
        - 'Young' if age < 30
        - 'Mid-age' if 30 <= age < 40
        - 'Senior' if age >= 40
        - None if the age is missing
    """
    # NaN fails every "<" comparison, so without this guard a missing age
    # would fall through to the last branch and be labelled "Senior";
    # None would raise a TypeError on the first comparison.
    if pd.isna(age):
        return None
    if age < 30:
        return "Young"
    if age < 40:
        return "Mid-age"
    return "Senior"

# Label every row by passing its Age through get_age_group, then store the
# labels in a new 'AgeGroup' column.
age_group_labels = df_filtered["Age"].map(get_age_group)
df_filtered.loc[:, "AgeGroup"] = age_group_labels

print("\n DataFrame with new 'AgeGroup' column:\n")
print(df_filtered)



 DataFrame with new 'AgeGroup' column:

   EmpID FullName   Age     Dept  MonthlySalary   JoinDate AgeGroup
0    201     Amit  26.0    Admin        52000.0 2019-02-10    Young
1    202   Bhavna  31.0     Tech        63000.0 2020-08-25  Mid-age
2    203   Chetan  35.5     Tech        57000.0 2021-03-14  Mid-age
3    204    Deepa  44.0  Finance        58500.0 2016-10-05   Senior
4    207   Farhan  41.0     Tech        62000.0 2020-01-30   Senior


In [13]:
# Aggregating by New 'AgeGroup' Column

# 1. Mean MonthlySalary per AgeGroup; as_index=False keeps AgeGroup as an
#    ordinary column instead of turning it into the index.
avg_salary_by_agegroup = (
    df_filtered
    .groupby("AgeGroup", as_index=False)["MonthlySalary"]
    .mean()
)

print("\n Average MonthlySalary by AgeGroup:\n")
print(avg_salary_by_agegroup)



 Average MonthlySalary by AgeGroup:

  AgeGroup  MonthlySalary
0  Mid-age        60000.0
1   Senior        60250.0
2    Young        52000.0


In [14]:
# Handling Date Columns and Filtering by Date

# Keep employees whose JoinDate is strictly later than January 1, 2019
join_cutoff = pd.Timestamp("2019-01-01")
joined_after_cutoff = df_filtered["JoinDate"] > join_cutoff
recent_joiners = df_filtered.loc[joined_after_cutoff]

print("\n Employees who joined after 2019-01-01:\n")
print(recent_joiners)



 Employees who joined after 2019-01-01:

   EmpID FullName   Age   Dept  MonthlySalary   JoinDate AgeGroup
0    201     Amit  26.0  Admin        52000.0 2019-02-10    Young
1    202   Bhavna  31.0   Tech        63000.0 2020-08-25  Mid-age
2    203   Chetan  35.5   Tech        57000.0 2021-03-14  Mid-age
4    207   Farhan  41.0   Tech        62000.0 2020-01-30   Senior
