# Step 1: Reading CSV Files and Manipulating Data Frames
## Reading a CSV File

In [5]:
import pandas as pd

# Reading a CSV file
df = pd.read_csv(r'C:\Users\Owner\Downloads\sample_data.csv')

# Displaying the first few rows of the dataframe
print("Initial DataFrame:\n", df.head())



Initial DataFrame:
       name  age   department  salary
0    Alice   25           HR   50000
1      Bob   32  Engineering   60000
2  Charlie   22    Marketing   45000
3    David   45  Engineering   80000
4      Eva   28           HR   52000


# Step 2: Performing Simple Data Cleaning Tasks
## Handling Missing Values

In [6]:
import pandas as pd

# Sample data with missing values
data_with_missing_values = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'age': [25, None, 22, 45, 28],
    'department': ['HR', 'Engineering', None, 'Engineering', 'HR'],
    'salary': [50000, 60000, 45000, None, 52000]
}

df = pd.DataFrame(data_with_missing_values)

# Checking for missing values
print("Missing values before handling:\n", df.isnull().sum())

# Filling missing values with a specific value (e.g., 0)
df.fillna(0, inplace=True)

# Displaying the dataframe after handling missing values
print("DataFrame after filling missing values:\n", df)


Missing values before handling:
 name          0
age           1
department    1
salary        1
dtype: int64
DataFrame after filling missing values:
       name   age   department   salary
0    Alice  25.0           HR  50000.0
1      Bob   0.0  Engineering  60000.0
2  Charlie  22.0            0  45000.0
3    David  45.0  Engineering      0.0
4      Eva  28.0           HR  52000.0


## Removing Duplicates

In [7]:
import pandas as pd

# Sample data with duplicates
data_with_duplicates = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'age': [25, 32, 22, 45, 28, 25],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR', 'HR'],
    'salary': [50000, 60000, 45000, 80000, 52000, 50000]
}

df = pd.DataFrame(data_with_duplicates)

# Checking for duplicates
print("Number of duplicate rows before removal:", df.duplicated().sum())

# Removing duplicate rows
df.drop_duplicates(inplace=True)

# Displaying the dataframe after removing duplicates
print("DataFrame after removing duplicates:\n", df)


Number of duplicate rows before removal: 1
DataFrame after removing duplicates:
       name  age   department  salary
0    Alice   25           HR   50000
1      Bob   32  Engineering   60000
2  Charlie   22    Marketing   45000
3    David   45  Engineering   80000
4      Eva   28           HR   52000


# Step 3: Basic Data Manipulation Operations
## Filtering Data

In [8]:
import pandas as pd

# Sample data
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'age': [25, 32, 22, 45, 28],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR'],
    'salary': [50000, 60000, 45000, 80000, 52000]
}

df = pd.DataFrame(data)

# Filtering rows where 'age' is greater than 30
filtered_df = df[df['age'] > 30]

# Displaying the filtered dataframe
print("Filtered DataFrame:\n", filtered_df)


Filtered DataFrame:
     name  age   department  salary
1    Bob   32  Engineering   60000
3  David   45  Engineering   80000


## Sorting Data

In [9]:
import pandas as pd

# Sample data
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'age': [25, 32, 22, 45, 28],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR'],
    'salary': [50000, 60000, 45000, 80000, 52000]
}

df = pd.DataFrame(data)

# Sorting data by 'salary' in descending order
sorted_df = df.sort_values(by='salary', ascending=False)

# Displaying the sorted dataframe
print("Sorted DataFrame:\n", sorted_df)


Sorted DataFrame:
       name  age   department  salary
3    David   45  Engineering   80000
1      Bob   32  Engineering   60000
4      Eva   28           HR   52000
0    Alice   25           HR   50000
2  Charlie   22    Marketing   45000


## Grouping Data

In [11]:
import pandas as pd

# Sample data
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'age': [25, 32, 22, 45, 28],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR'],
    'salary': [50000, 60000, 45000, 80000, 52000]
}

df = pd.DataFrame(data)

# Grouping data by 'department' and calculating the mean salary
grouped_df = df.groupby('department')['salary'].mean()

# Displaying the grouped dataframe
print("Grouped DataFrame (Mean Salary by Department):\n", grouped_df)


Grouped DataFrame (Mean Salary by Department):
 department
Engineering    70000.0
HR             51000.0
Marketing      45000.0
Name: salary, dtype: float64


# Putting It All Together

In [12]:
import pandas as pd

# Step 1: Read the CSV file
df = pd.read_csv(r'C:\Users\Owner\Downloads\sample_data.csv')

# Step 2: Data Cleaning
# Handle missing values
df.fillna(0, inplace=True)
# Remove duplicates
df.drop_duplicates(inplace=True)

# Step 3: Data Manipulation
# Filter rows where 'age' is greater than 30
filtered_df = df[df['age'] > 30]

# Sort data by 'salary' in descending order
sorted_df = df.sort_values(by='salary', ascending=False)

# Group data by 'department' and calculate the mean salary
grouped_df = df.groupby('department')['salary'].mean()

# Display results
print("Filtered DataFrame:\n", filtered_df)
print("Sorted DataFrame:\n", sorted_df)
print("Grouped DataFrame (Mean Salary by Department):\n", grouped_df)


Filtered DataFrame:
     name  age   department  salary
1    Bob   32  Engineering   60000
3  David   45  Engineering   80000
Sorted DataFrame:
       name  age   department  salary
3    David   45  Engineering   80000
1      Bob   32  Engineering   60000
4      Eva   28           HR   52000
0    Alice   25           HR   50000
2  Charlie   22    Marketing   45000
Grouped DataFrame (Mean Salary by Department):
 department
Engineering    70000.0
HR             51000.0
Marketing      45000.0
Name: salary, dtype: float64
