In [3]:
#missing value handling

import pandas as pd

# Sample DataFrame with missing values
data = {
    'A': [1, 2, None, 4, 5],
    'B': [None, 2, 3, None, 5],
    'C': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)

# Method 1: Drop rows with missing values
df_dropped = df.dropna()

# Method 2: Fill missing values with a specified value
df_filled = df.fillna(0)  # Replace missing values with 0

# Method 3: Fill missing values with the mean of the column
df_mean_filled = df.fillna(df.mean())

# Method 4: Fill missing values with the median of the column
df_median_filled = df.fillna(df.median())

# Method 5: Interpolation
df_interpolated = df.interpolate()

# Print the original DataFrame and the transformed DataFrames
print("Original DataFrame:")
print(df)
print("\nDataFrame after dropping rows with missing values:")
print(df_dropped)
print("\nDataFrame after filling missing values with 0:")
print(df_filled)
print("\nDataFrame after filling missing values with column mean:")
print(df_mean_filled)
print("\nDataFrame after filling missing values with column median:")
print(df_median_filled)
print("\nDataFrame after interpolation:")
print(df_interpolated)


Original DataFrame:
     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  NaN  3.0  3
3  4.0  NaN  4
4  5.0  5.0  5

DataFrame after dropping rows with missing values:
     A    B  C
1  2.0  2.0  2
4  5.0  5.0  5

DataFrame after filling missing values with 0:
     A    B  C
0  1.0  0.0  1
1  2.0  2.0  2
2  0.0  3.0  3
3  4.0  0.0  4
4  5.0  5.0  5

DataFrame after filling missing values with column mean:
     A         B  C
0  1.0  3.333333  1
1  2.0  2.000000  2
2  3.0  3.000000  3
3  4.0  3.333333  4
4  5.0  5.000000  5

DataFrame after filling missing values with column median:
     A    B  C
0  1.0  3.0  1
1  2.0  2.0  2
2  3.0  3.0  3
3  4.0  3.0  4
4  5.0  5.0  5

DataFrame after interpolation:
     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  3.0  3.0  3
3  4.0  4.0  4
4  5.0  5.0  5


In [9]:
#outliers data
import pandas as pd
import numpy as np

def handle_outliers_zscore(data, columns, threshold=3):
    """
    Handle outliers using Z-score method.

    Parameters:
        data (DataFrame): Input DataFrame.
        columns (list): List of columns to handle outliers.
        threshold (float): Threshold value for Z-score. Observations with Z-score
                           greater than this threshold will be considered as outliers.

    Returns:
        DataFrame: Data with outliers replaced with NaNs.
    """
    data_cleaned = data.copy()
    for col in columns:
        z_scores = (data[col] - data[col].mean()) / data[col].std()
        outlier_indices = np.abs(z_scores) > threshold
        data_cleaned.loc[outlier_indices, col] = np.nan
    return data_cleaned

# Example usage
# Load your dataset
# For demonstration, let's create a simple dataset
data = {
    'A': [1, 2, 3, 4, 5, 100],
    'B': [10, 20, 30, 40, 50, 1000]
}
df = pd.DataFrame(data)

# Handle outliers for columns 'A' and 'B'
cleaned_df = handle_outliers_zscore(df, columns=['A', 'B'])

# Display the cleaned dataset
print("Original Dataset:")
print(df)
print("\nDataset after handling outliers with Z-score method:")
print(cleaned_df)


Original Dataset:
     A     B
0    1    10
1    2    20
2    3    30
3    4    40
4    5    50
5  100  1000

Dataset after handling outliers with Z-score method:
       A       B
0    1.0    10.0
1    2.0    20.0
2    3.0    30.0
3    4.0    40.0
4    5.0    50.0
5  100.0  1000.0


In [10]:

import pandas as pd

# Load your dataset
# For demonstration, let's create a simple dataset
data = {
    'Name': ['John', 'Jane', 'Alice', 'Bob'],
    'Age': [25, 30, 35, 40],
    'Gender': ['Male', 'Female', 'Female', 'Male'],
    'Income': [50000, 60000, 70000, 80000],
    'Irrelevant_Column': ['A', 'B', 'C', 'D']  # This column is considered irrelevant
}
df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
print(df)

# Remove irrelevant column
df_cleaned = df.drop(columns=['Irrelevant_Column'])

# Display the dataset after removing irrelevant column
print("\nDataset after removing irrelevant column:")
print(df_cleaned)


Original Dataset:
    Name  Age  Gender  Income Irrelevant_Column
0   John   25    Male   50000                 A
1   Jane   30  Female   60000                 B
2  Alice   35  Female   70000                 C
3    Bob   40    Male   80000                 D

Dataset after removing irrelevant column:
    Name  Age  Gender  Income
0   John   25    Male   50000
1   Jane   30  Female   60000
2  Alice   35  Female   70000
3    Bob   40    Male   80000
