In [10]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Identify outliers using Z-score
from scipy import stats

# 1. Loading the dummy data

## Step 1: Generate and load the dummy data 


In [31]:
# Create a dummy dataset
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # Normally distributed with an outlier
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # Random integers
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # Categorical with some missing values
    'Target': np.random.choice([0, 1], 102).tolist()  # Binary target variable
}

# Convert the dictionary to a pandas DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Display the first few rows of the dummy dataset
df_dummy

Unnamed: 0,Feature1,Feature2,Category,Target
0,117.640523,32,A,1
1,104.001572,70,B,1
2,109.787380,85,C,0
3,122.408932,31,D,1
4,118.675580,13,A,0
...,...,...,...,...
97,117.858705,35,B,1
98,101.269121,30,C,1
99,104.019894,29,D,1
100,,33,,1


# 2. Applying the preprocessing tool 

Next, use the preprocessing tool you set up in the previous lesson to clean and preprocess this dummy data, making it ready for ML.

## Step 2: Load the preprocessing tool 

Ensure your preprocessing functions are loaded into your environment. These functions include handling missing values, removing outliers, scaling data, and encoding categorical variables.

In [20]:
def load_data(df):
    return df

def handle_missing_values(df):
    df_copy = df.copy()
    
    # Handle missing values in numeric columns with mean
    numeric_columns = df_copy.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
    
    # Handle missing values in categorical columns with mode (most frequent value)
    categorical_columns = df_copy.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        df_copy[col] = df_copy[col].fillna(df_copy[col].mode()[0] if not df_copy[col].mode().empty else 'Unknown')
    
    return df_copy

def remove_outliers(df):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
    return df[(z_scores < 3).all(axis=1)]  # Remove rows with any outliers

def scale_data(df):
    scaler = StandardScaler()
    df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))
    return df

def encode_categorical(df, categorical_columns):
    return pd.get_dummies(df, columns=categorical_columns)

def save_data(df, output_filepath):
    df.to_csv(output_filepath, index=False)

Explanation: These functions encapsulate the core preprocessing tasks, making them reusable across different datasets. They will be applied to our dummy data.

## Step 3: Preprocess the dummy data

Apply the preprocessing tool to the dummy data:



In [24]:
# Load the data
df_preprocessed = load_data(df_dummy)

# Handle missing values
df_preprocessed = handle_missing_values(df_preprocessed)

# Remove outliers
df_preprocessed = remove_outliers(df_preprocessed)

# Scale the data
df_preprocessed = scale_data(df_preprocessed)

# Encode categorical variables
df_preprocessed = encode_categorical(df_preprocessed, ['Category'])

# Display the preprocessed data
df_preprocessed

Unnamed: 0,Feature1,Feature2,Target,Category_A,Category_B,Category_C,Category_D
0,1.698298,-0.519379,0.932936,True,False,False,False
1,0.338384,0.887380,0.932936,False,True,False,False
2,0.915276,1.442679,-1.071884,False,False,True,False
3,2.173747,-0.556399,0.932936,False,False,False,True
4,1.801501,-1.222759,-1.071884,True,False,False,False
...,...,...,...,...,...,...,...
96,-0.050136,0.776320,-1.071884,True,False,False,False
97,1.720052,-0.408319,0.932936,False,True,False,False
98,0.065937,-0.593419,0.932936,False,False,True,False
99,0.340211,-0.630439,0.932936,False,False,False,True


Explanation: This code applies the preprocessing steps to the dummy data. It handles missing values by filling them with the mean, removes outliers using the Z-score method, scales the numeric data, and encodes the categorical variables using one-hot encoding.

# 3. Saving the preprocessed data

Finally, save the preprocessed data to a new comma-separated values (CSV) file for use in ML tasks. 


## Step 4: Save the preprocessed data 


In [32]:
# Save the cleaned and preprocessed DataFrame to a CSV file
save_data(df_preprocessed, 'preprocessed_dummy_data.csv')

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv


Explanation: Saving the preprocessed data to a new file ensures that it’s ready for use in training ML models. This step makes it easy to use the cleaned and processed data in future analysis or modeling efforts.



# 4. Verifying the preprocessing steps 

After preprocessing, it’s important to verify that the data has been processed correctly:



Check for missing values: 

Explanation: This checks that all missing values. have been handled properly.


In [40]:
df_preprocessed.isnull().sum()

Feature1      0
Feature2      0
Target        0
Category_A    0
Category_B    0
Category_C    0
Category_D    0
dtype: int64

Verify outlier removal:

Explanation: This summarizes the dataset and confirms that any extreme values (outliers). have been removed.


In [39]:
df_preprocessed.describe()

Unnamed: 0,Feature1,Feature2,Target
count,101.0,101.0,101.0
mean,-2.526444e-15,-3.407615e-17,-2.4183080000000002e-17
std,1.004988,1.004988,1.004988
min,-2.606142,-1.704018,-1.071884
25%,-0.6930755,-0.667459,-1.071884
50%,0.06071482,-0.1861994,0.9329364
75%,0.6663572,0.8503597,0.9329364
max,2.202524,1.886919,0.9329364


Inspect scaled data: 

Explanation: This ensures that the numeric features have been scaled properly, making them ready for ML algorithms.



In [37]:
df_preprocessed.head()

Unnamed: 0,Feature1,Feature2,Target,Category_A,Category_B,Category_C,Category_D
0,1.698298,-0.519379,0.932936,True,False,False,False
1,0.338384,0.88738,0.932936,False,True,False,False
2,0.915276,1.442679,-1.071884,False,False,True,False
3,2.173747,-0.556399,0.932936,False,False,False,True
4,1.801501,-1.222759,-1.071884,True,False,False,False


Check categorical encoding:

Explanation: This confirms that the categorical variables have been encoded into numerical values correctly.



In [38]:
df_preprocessed.columns

Index(['Feature1', 'Feature2', 'Target', 'Category_A', 'Category_B',
       'Category_C', 'Category_D'],
      dtype='object')