In [1]:
pip install pandas



In [2]:
import pandas as pd

In [3]:
# Read the sample CSV into a DataFrame and print it
df = pd.read_csv(filepath_or_buffer='/content/sample_data.csv')
print(df)

      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
1      Bob  30.0      NaN  Enginnering
2  Charlie  35.0  70000.0  Enginnering
3    David   NaN  60000.0           HR
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales


In [5]:
# Count the NaN entries in each column
print(df.isna().sum(axis=0))

Name          0
Age           1
Salary        1
Departmrnt    0
dtype: int64


In [6]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        8 non-null      object 
 1   Age         7 non-null      float64
 2   Salary      7 non-null      float64
 3   Departmrnt  8 non-null      object 
dtypes: float64(2), object(2)
memory usage: 384.0+ bytes


Removing Rows with Missing Values

In [7]:
# Drop every row that has a NaN in at least one column
df_cleaned = df.dropna(axis=0, how='any')
print(df_cleaned)

      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
2  Charlie  35.0  70000.0  Enginnering
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales


Filling Missing Values

In [8]:
# Impute missing Age / Salary entries with the mean of the respective column
df_filled = df.fillna({col_name: df[col_name].mean() for col_name in ('Age', 'Salary')})
print(df_filled)

      Name        Age   Salary   Departmrnt
0    Alice  25.000000  50000.0           HR
1      Bob  30.000000  70000.0  Enginnering
2  Charlie  35.000000  70000.0  Enginnering
3    David  38.285714  60000.0           HR
4      Eve  28.000000  80000.0           HR
5    Frank  40.000000  55000.0        Sales
6    Grace  50.000000  85000.0        Sales
7     Hank  60.000000  90000.0        Sales


Forward Fill Method

In [9]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df_ffill = df.ffill()
print(df_ffill)

      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
1      Bob  30.0  50000.0  Enginnering
2  Charlie  35.0  70000.0  Enginnering
3    David  35.0  60000.0           HR
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales


Backward Fill Method

In [10]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df_bfill = df.bfill()
print(df_bfill)

      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
1      Bob  30.0  70000.0  Enginnering
2  Charlie  35.0  70000.0  Enginnering
3    David  28.0  60000.0           HR
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales


Removing Duplicates

In [13]:
# Demo setup: append copies of the first two rows so there is something to deduplicate.
# NOTE: this rebinds `df`, so running the cell again appends two more copies.
df = pd.concat([df, df.iloc[[0, 1]]], ignore_index=True)
print('Before revoming duplicates:\n', df)
# Keep only the first occurrence of each fully identical row
df_no_duplicates = df.drop_duplicates(keep='first')
print('After removing duplicates:\n', df_no_duplicates)

Before revoming duplicates:
       Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
1      Bob  30.0      NaN  Enginnering
2  Charlie  35.0  70000.0  Enginnering
3    David   NaN  60000.0           HR
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales
8    Alice  25.0  50000.0           HR
9      Bob  30.0      NaN  Enginnering
After removing duplicates:
       Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           HR
1      Bob  30.0      NaN  Enginnering
2  Charlie  35.0  70000.0  Enginnering
3    David   NaN  60000.0           HR
4      Eve  28.0  80000.0           HR
5    Frank  40.0  55000.0        Sales
6    Grace  50.0  85000.0        Sales
7     Hank  60.0  90000.0        Sales


Replacing Incorrect Values

In [15]:
# Expand the department abbreviations stored in the 'Departmrnt' column
# (the column name is spelled that way in the data).
df_corrected = df.replace(
    to_replace={
        'Departmrnt': {
            'HR': 'Human Resources',
            'Sales': 'Sales Department',
        }
    }
)
print(df_corrected)

      Name   Age   Salary        Departmrnt
0    Alice  25.0  50000.0   Human Resources
1      Bob  30.0      NaN       Enginnering
2  Charlie  35.0  70000.0       Enginnering
3    David   NaN  60000.0   Human Resources
4      Eve  28.0  80000.0   Human Resources
5    Frank  40.0  55000.0  Sales Department
6    Grace  50.0  85000.0  Sales Department
7     Hank  60.0  90000.0  Sales Department
8    Alice  25.0  50000.0   Human Resources
9      Bob  30.0      NaN       Enginnering


Ensuring Consistency

In [17]:
# Convert all department names to lowercase.
# This overwrites the column on `df` itself, so every later cell sees the lowercased values.
df['Departmrnt'] = df['Departmrnt'].str.lower()
print(df)

      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           hr
1      Bob  30.0      NaN  enginnering
2  Charlie  35.0  70000.0  enginnering
3    David   NaN  60000.0           hr
4      Eve  28.0  80000.0           hr
5    Frank  40.0  55000.0        sales
6    Grace  50.0  85000.0        sales
7     Hank  60.0  90000.0        sales
8    Alice  25.0  50000.0           hr
9      Bob  30.0      NaN  enginnering


Min-Max Normalization

In [18]:
# Min-max scaling: map each numeric column onto [0, 1] via (x - min) / (max - min).
# NaN entries stay NaN because pandas' min()/max() skip missing values by default.
df_normalized = df.copy()
for col in ('Age', 'Salary'):
    col_min = df[col].min()
    col_span = df[col].max() - col_min
    df_normalized[col] = (df[col] - col_min) / col_span
print("Original DataFrame", df, sep="\n")
print("\nNormalized DataFrame", df_normalized, sep="\n")


Original DataFrame
      Name   Age   Salary   Departmrnt
0    Alice  25.0  50000.0           hr
1      Bob  30.0      NaN  enginnering
2  Charlie  35.0  70000.0  enginnering
3    David   NaN  60000.0           hr
4      Eve  28.0  80000.0           hr
5    Frank  40.0  55000.0        sales
6    Grace  50.0  85000.0        sales
7     Hank  60.0  90000.0        sales
8    Alice  25.0  50000.0           hr
9      Bob  30.0      NaN  enginnering

Normalized DataFrame
      Name       Age  Salary   Departmrnt
0    Alice  0.000000   0.000           hr
1      Bob  0.142857     NaN  enginnering
2  Charlie  0.285714   0.500  enginnering
3    David       NaN   0.250           hr
4      Eve  0.085714   0.750           hr
5    Frank  0.428571   0.125        sales
6    Grace  0.714286   0.875        sales
7     Hank  1.000000   1.000        sales
8    Alice  0.000000   0.000           hr
9      Bob  0.142857     NaN  enginnering
