In [10]:
import pandas as pd

# Create a small dataset
data = {
    'Name': ['John', 'Jane', None, 'Anna', 'Mark'],
    'Age': [25, None, 30, 28, 32],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 62000, None]
}

df = pd.DataFrame(data)

# Option 1: Fill missing values (for example, fill 'Age' and 'Salary' with the mean)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
# Option 2: Drop rows with missing 'Name'
# df.dropna(subset=['Name'], inplace=True)

print(df)

   Name    Age  Gender   Salary
0  John  25.00    Male  50000.0
1  Jane  28.75  Female  60000.0
2  None  30.00    Male  55000.0
3  Anna  28.00  Female  62000.0
4  Mark  32.00    Male  56750.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


## Handling Missing Values
To handle missing values, you have a few options:

Fill missing values with a default value, mean, or median.
Remove rows or columns with missing data.

In [4]:

# Option 1: Fill missing values (for example, fill 'Age' and 'Salary' with the mean)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

# Option 2: Drop rows with missing 'Name'
df.dropna(subset=['Name'], inplace=True)
# It just drops the entire set of information so instead of 5 it is now 4 names
print(df)


NameError: name 'pd' is not defined

In [12]:
import pandas as pd

# Create a small dataset
data = {
    'Name': ['John', 'Jane', None, 'Anna', 'Mark'],
    'Age': [25, None, 30, 28, 32],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 62000, None]
}

df = pd.DataFrame(data)
print(df)

   Name   Age  Gender   Salary
0  John  25.0    Male  50000.0
1  Jane   NaN  Female  60000.0
2  None  30.0    Male  55000.0
3  Anna  28.0  Female  62000.0
4  Mark  32.0    Male      NaN


## Correcting Data Types
Sometimes, data might be loaded as the wrong type. For example, numbers may be read as strings, so you need to convert them.

In [17]:
import pandas as pd

# Create a small dataset
data = {
    'Name': ['John', 'Jane', None, 'Anna', 'Mark'],
    'Age': [25, None, 30, 28, 32],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 62000, None]
}

df = pd.DataFrame(data)
# Ensure 'Age' and 'Salary' are of type float or int
df['Age'] = df['Age'].astype(float)
df['Salary'] = df['Salary'].astype(float)
print(df)

   Name   Age  Gender   Salary
0  John  25.0    Male  50000.0
1  Jane   NaN  Female  60000.0
2  None  30.0    Male  55000.0
3  Anna  28.0  Female  62000.0
4  Mark  32.0    Male      NaN


In [25]:
import pandas as pd

# Create a small dataset
data = {
    'Name': ['John', 'Jane', None, 'Anna', 'Mark'],
    'Age': [25, None, 30, 28, 32],
    'Gender': ['male', 'Female', 'male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 62000, None]
}

df = pd.DataFrame(data)
# Standardize 'Gender' column
df['Gender'] = df['Gender'].str.capitalize()
# Just captilize the first letter of the gender male to Male. 
print(df)

   Name   Age  Gender   Salary
0  John  25.0    Male  50000.0
1  Jane   NaN  Female  60000.0
2  None  30.0    Male  55000.0
3  Anna  28.0  Female  62000.0
4  Mark  32.0    Male      NaN


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn import preprocessing
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
import pandas as pd
# Initial dataset
data = {
    'Name': ['John', 'Jane', None, 'Anna', 'Mark'],
    'Age': [25, None, 30, 28, 32],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 62000, None]
}

# Create DataFrame
df = pd.DataFrame(data)

# Clean the dataset
# Fill missing 'Age' and 'Salary' with their mean
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

# Drop rows with missing 'Name'
df.dropna(subset=['Name'], inplace=True)

# Ensure 'Age' and 'Salary' are of type float
df['Age'] = df['Age'].astype(float)
df['Salary'] = df['Salary'].astype(float)

# Standardize the 'Gender' column
df['Gender'] = df['Gender'].str.capitalize()

# Display the cleaned dataset
import ace_tools as tools; tools.display_dataframe_to_user(name="Cleaned Dataset", dataframe=df)


Result
   Name    Age  Gender   Salary
0  John  25.00    Male  50000.0
1  Jane  28.75  Female  60000.0
3  Anna  28.00  Female  62000.0
4  Mark  32.00    Male  56750.0

In [33]:
# Remove outliers using z-scores (z > 3)
from scipy import stats
df = df[(np.abs(stats.zscore(df['Salary'])) < 3)]
print(df)


   Name    Age  Gender   Salary
0  John  25.00    Male  50000.0
1  Jane  28.75  Female  60000.0
3  Anna  28.00  Female  62000.0
4  Mark  32.00    Male  56750.0


## Data Normalization and Scaling
When you have numerical columns with very different scales (e.g., income in thousands vs. age in years), you might want to normalize or scale the data. This is important for algorithms that rely on distances between data points (e.g., k-nearest neighbors, clustering).

Normalization: Rescaling the data to fit within a specific range (e.g., between 0 and 1).
Standardization: Centering the data around the mean with a unit standard deviation.

In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['Salary'] = scaler.fit_transform(df[['Salary']])
print(df)
## Dealing with specific range between 0 and 1 convert the values in Salary 

   Name    Age  Gender    Salary
0  John  25.00    Male  0.000000
1  Jane  28.75  Female  0.833333
3  Anna  28.00  Female  1.000000
4  Mark  32.00    Male  0.562500


## Encoding Categorical Variables
If you have categorical data (e.g., "Male", "Female"), many machine learning models can't process them directly. You can encode these categories numerically.

Label Encoding: Assign a unique number to each category.
One-Hot Encoding: Create binary columns for each category.

In [38]:
# One-Hot Encoding for 'Gender' column
df = pd.get_dummies(df, columns=['Gender'])
print(df)
# converting gender to true or false to make computers life easier. 

   Name    Age    Salary  Gender_Female  Gender_Male
0  John  25.00  0.000000          False         True
1  Jane  28.75  0.833333           True        False
3  Anna  28.00  1.000000           True        False
4  Mark  32.00  0.562500          False         True


## Imputing Missing Data
In addition to filling missing values with the mean, you can use other methods to impute missing values:

Median or Mode: For skewed distributions, use the median instead of the mean.
Forward/Backward Fill: For time-series data, you can propagate the last or next valid value.
KNN Imputation: Use the k-nearest neighbors algorithm to impute missing values based on similarity between data points.

In [44]:
# Forward fill missing data in a time-series dataset
df.fillna(method='ffill', inplace=True)
print(df)

   Name    Age    Salary  Gender_Female  Gender_Male
0  John  25.00  0.000000          False         True
1  Jane  28.75  0.833333           True        False
3  Anna  28.00  1.000000           True        False
4  Mark  32.00  0.562500          False         True


  df.fillna(method='ffill', inplace=True)


## Feature Engineering
Sometimes, you can clean a dataset by creating new, more informative features or modifying existing ones:

Binning: Convert continuous data into discrete categories (e.g., bin ages into groups like 20-29, 30-39).
Date Feature Extraction: Extract year, month, or day from a date column for better analysis.

In [47]:
# Binning ages into groups
df['Age_Group'] = pd.cut(df['Age'], bins=[20, 30, 40], labels=['20-30', '30-40'])
print(df)

   Name    Age    Salary  Gender_Female  Gender_Male Age_Group
0  John  25.00  0.000000          False         True     20-30
1  Jane  28.75  0.833333           True        False     20-30
3  Anna  28.00  1.000000           True        False     20-30
4  Mark  32.00  0.562500          False         True     30-40


## Handling Duplicates with Aggregation
Sometimes, you don’t want to remove duplicates outright, but instead aggregate them in a meaningful way (e.g., summing or averaging).

In [51]:
# Group by 'Name' and aggregate salaries
df = df.groupby('Name').agg({'Salary': 'mean'})
print(df)

        Salary
Name          
Anna  1.000000
Jane  0.833333
John  0.000000
Mark  0.562500


## Handling Text Data (Text Cleaning)
For datasets with text data, you can clean the text by:

Removing punctuation or special characters.
Lowercasing text.
Removing stopwords like "and", "the", "is".
Stemming/Lemmatization: Reducing words to their base or root form.

In [56]:
import re

# Example: Clean text data by removing special characters
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
print(df)

KeyError: 'Text'

## Handling Inconsistent Data Entry
Data entry errors often introduce inconsistencies in the dataset (e.g., "NY", "New York", and "ny" all representing the same city).

Standardize formats: Use string methods to unify the data.


In [59]:
# Standardize the 'City' column to all uppercase
df['City'] = df['City'].str.upper()
print(df)

KeyError: 'City'