# Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

## Data sources

-  [Pokemon Power](https://www.kaggle.com/datasets/sujithmandala/pokmon-combat-power-prediction)
-  [Auto MPG](https://www.kaggle.com/datasets/yasserh/auto-mpg-dataset)
-  [Insurance Payout](https://www.kaggle.com/datasets/harshsingh2209/medical-insurance-payout)
-  [Concrete Strength](https://www.kaggle.com/datasets/prathamtripathi/regression-with-neural-networking)
-  [Laptop Price](https://www.kaggle.com/datasets/gyanprakashkushwaha/laptop-price-prediction-cleaned-dataset)
-  [Income Household](https://www.kaggle.com/datasets/stealthtechnologies/regression-dataset-for-household-income-analysis)

## Load data

In [2]:
df1 = pd.read_csv("../data/pokemon.csv")
df1.head()

Unnamed: 0,Name,Type 1,Type 2,Combat Power,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,Charmander,Fire,,309,39,52,43,60,50,65,1,False
4,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False


In [3]:
df2 = pd.read_csv("../data/auto-mpg.csv")
df2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
df3 = pd.read_csv("../data/insurance.csv")
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df4 = pd.read_csv("../data/concrete_data.csv")
df4.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [6]:
df5 = pd.read_csv("../data/laptop_data_cleaned.csv")
df5.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [7]:
df6 = pd.read_csv("../data/household_income.csv")
df6.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210


## Quick fixes (not covered in next common changes section)

In [8]:
def print_unique_values(df):
    object_cols = df.select_dtypes(include='object')

    for col in object_cols.columns:
        unique_count = df[col].nunique()
        print(f"Column '{col}' has {unique_count} unique value(s).")

In [9]:
for i, df in enumerate([df1, df2, df3, df4, df5, df6]):
    print(f"In df{i+1}")
    print_unique_values(df)

In df1
Column 'Name' has 151 unique value(s).
Column 'Type 1' has 15 unique value(s).
Column 'Type 2' has 11 unique value(s).
In df2
Column 'horsepower' has 94 unique value(s).
Column 'car name' has 305 unique value(s).
In df3
Column 'sex' has 2 unique value(s).
Column 'smoker' has 2 unique value(s).
Column 'region' has 4 unique value(s).
In df4
In df5
Column 'Company' has 19 unique value(s).
Column 'TypeName' has 6 unique value(s).
Column 'Cpu_brand' has 5 unique value(s).
Column 'Gpu_brand' has 3 unique value(s).
Column 'Os' has 3 unique value(s).
In df6
Column 'Education_Level' has 4 unique value(s).
Column 'Occupation' has 5 unique value(s).
Column 'Location' has 3 unique value(s).
Column 'Marital_Status' has 3 unique value(s).
Column 'Employment_Status' has 3 unique value(s).
Column 'Homeownership_Status' has 2 unique value(s).
Column 'Type_of_Housing' has 3 unique value(s).
Column 'Gender' has 2 unique value(s).
Column 'Primary_Mode_of_Transportation' has 4 unique value(s).


### df1

In [10]:
df1.shape

(151, 12)

Ok, Name col in Pokemon (df1) dataset should be deleted as its useless

In [11]:
df1.drop(columns=["Name"], inplace=True)
df1.head()

Unnamed: 0,Type 1,Type 2,Combat Power,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Grass,Poison,525,80,82,83,100,100,80,1,False
3,Fire,,309,39,52,43,60,50,65,1,False
4,Fire,,405,58,64,58,80,65,80,1,False


### df2

In [12]:
df2.shape

(398, 9)

From car name we can extract brand of car simply by doing:

In [13]:
df2["car name"].apply(lambda x: x.split()[0])

0      chevrolet
1          buick
2       plymouth
3            amc
4           ford
         ...    
393         ford
394           vw
395        dodge
396         ford
397        chevy
Name: car name, Length: 398, dtype: object

In [14]:
df2["car name"].apply(lambda x: x.split()[0]).value_counts()

car name
ford             51
chevrolet        43
plymouth         31
amc              28
dodge            28
toyota           25
datsun           23
buick            17
pontiac          16
volkswagen       15
honda            13
mercury          11
mazda            10
oldsmobile       10
fiat              8
peugeot           8
audi              7
chrysler          6
vw                6
volvo             6
renault           5
saab              4
subaru            4
opel              4
chevy             3
bmw               2
cadillac          2
maxda             2
mercedes-benz     2
triumph           1
vokswagen         1
mercedes          1
hi                1
capri             1
chevroelt         1
toyouta           1
nissan            1
Name: count, dtype: int64

We can also map some of them to correct one names manually (maxda and mazda are probably the same)

In [15]:
def mapping(name):
    if name == 'chevy' or name == 'chevroelt':
        return 'chevrolet'
    elif name == 'maxda':
        return 'mazda'
    elif name == 'vokswagen' or name == 'vw':
        return 'volkswagen'
    elif name == 'toyouta':
        return 'toyota'
    elif name == 'mercedes':
        return 'mercedes-benz'
    else:
        return name

In [16]:
df2["car name"].apply(lambda x: x.split()[0]).map(mapping).value_counts()

car name
ford             51
chevrolet        47
plymouth         31
amc              28
dodge            28
toyota           26
datsun           23
volkswagen       22
buick            17
pontiac          16
honda            13
mazda            12
mercury          11
oldsmobile       10
fiat              8
peugeot           8
audi              7
volvo             6
chrysler          6
renault           5
saab              4
opel              4
subaru            4
mercedes-benz     3
cadillac          2
bmw               2
capri             1
hi                1
triumph           1
nissan            1
Name: count, dtype: int64

And finally I map small ones to others (when val count < 6)

In [17]:
my_series = df2["car name"].apply(lambda x: x.split()[0]).map(mapping)
value_counts = my_series.value_counts()

common_values = value_counts[value_counts >= 6].index

mapped_series = my_series.where(my_series.isin(common_values), other='other')

df2["brand"] = mapped_series
print(mapped_series.value_counts())

car name
ford          51
chevrolet     47
plymouth      31
other         28
amc           28
dodge         28
toyota        26
datsun        23
volkswagen    22
buick         17
pontiac       16
honda         13
mazda         12
mercury       11
oldsmobile    10
fiat           8
peugeot        8
audi           7
chrysler       6
volvo          6
Name: count, dtype: int64


In [18]:
# Rest from car name i will drop
df2.drop(columns=["car name"], inplace=True)

Also horsepower was a problem as it was object type somehow (it occurs that there is ? as un)

In [19]:
df2['horsepower'].value_counts()[20:21]

horsepower
?    6
Name: count, dtype: int64

In [20]:
df2['horsepower'] = df2['horsepower'].map(lambda x: np.nan if x == "?" else x)

In [21]:
df2['horsepower'] = df2['horsepower'].astype(np.float64)

In [22]:
df2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,brand
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford


### df5

In [23]:
df5.Company.value_counts()

Company
Dell         291
Lenovo       289
HP           268
Asus         151
Acer         101
MSI           54
Toshiba       48
Apple         21
Samsung        8
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: count, dtype: int64

In [24]:
# Same as in df2 map brands with small count to others
my_series = df5["Company"].apply(lambda x: x.split()[0]).map(mapping)
value_counts = my_series.value_counts()

common_values = value_counts[value_counts >= 6].index

mapped_series = my_series.where(my_series.isin(common_values), other='Other')

df5["Company"] = my_series
print(mapped_series.value_counts())

Company
Dell         291
Lenovo       289
HP           268
Asus         151
Acer         101
MSI           54
Toshiba       48
Other         22
Apple         21
Samsung        8
Razer          7
Mediacom       7
Microsoft      6
Name: count, dtype: int64


## Common changes

In [25]:
datasets = [
    (df1, 'Combat Power'),
    (df2, 'mpg'),
    (df3, 'charges'),
    (df4, 'Strength'),
    (df5, 'Price'),
    (df6, 'Income')
]

In [26]:
# Taken from preprocessing_jrsh.ipynb
def preprocess_dataset(df, target_column=None):
    print(f"Original shape: {df.shape}")
    
    # duplicates
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape}")
    
    # columns with a single unique value
    single_value_cols = [col for col in df.columns if df[col].nunique() <= 1]
    df = df.drop(columns=single_value_cols)
    if single_value_cols:
        print(f"Dropped columns with single value: {single_value_cols}")
    print(f"After dropping single-value columns: {df.shape}")
    
    # handle target column if is categorical
    if target_column is not None:
        is_categorical = df[target_column].dtype == 'object' or df[target_column].dtype == 'category' or df[target_column].dtype == 'bool'
        is_categorical = is_categorical or (df[target_column].dtype.kind in 'ifu' and df[target_column].nunique() < 10)
        
        if is_categorical:
            class_counts = df[target_column].value_counts()
            classes_to_keep = class_counts[class_counts > 1].index
            df = df[df[target_column].isin(classes_to_keep)]
            print(f"Target is categorical - after removing rare classes: {df.shape}")
        else:
            print(f"Target is continuous - skipping removal of rare classes")
    
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
    if target_column and target_column in numeric_cols:
        numeric_cols.remove(target_column)
    if target_column and target_column in categorical_cols:
        categorical_cols.remove(target_column)
    
    binary_categorical_cols = [col for col in categorical_cols if df[col].nunique() == 2]
    non_binary_categorical_cols = [col for col in categorical_cols if col not in binary_categorical_cols]
    
    print(f"Numeric columns: {len(numeric_cols)}")
    print(f"Binary categorical columns: {len(binary_categorical_cols)}")
    print(f"Non-binary categorical columns: {len(non_binary_categorical_cols)}")
    
    # Handle NaNs
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].mean())
    
    for col in categorical_cols:
        df[col] = df[col].fillna('missing')
    
    # one-hot encoding
    if binary_categorical_cols or non_binary_categorical_cols:
        if non_binary_categorical_cols:
            df = pd.get_dummies(df, columns=non_binary_categorical_cols, dummy_na=False)
        
        if binary_categorical_cols:
            df = pd.get_dummies(df, columns=binary_categorical_cols, drop_first=True, dummy_na=False)

        dummy_cols = [col for col in df.columns if col not in numeric_cols + categorical_cols]
        if target_column:
            dummy_cols = [col for col in dummy_cols if col != target_column]
        for col in dummy_cols:
            if df[col].dtype == bool:
                df[col] = df[col].astype(int)
        
        print(f"Shape after one-hot encoding: {df.shape}")
    
    if target_column:
        X = df.drop(columns=[target_column])
        y = df[target_column].copy()
    else:
        X = df.copy()
        y = None
    
    # standarize numeric features
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if numeric_cols:
        scaler = StandardScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
    
    if target_column:
        processed_df = pd.concat([X, y], axis=1)
    else:
        processed_df = X
    
    print(f"Final shape after preprocessing: {processed_df.shape}")
    return processed_df

In [27]:
processed_dfs = []
for i, (df, target) in enumerate(datasets):
    print(f"\nProcessing dataset {i+1}:")
    processed_df = preprocess_dataset(df, target)
    processed_dfs.append(processed_df)
    print(f"First 5 rows of processed dataset {i+1}:")
    print(processed_df.head())


Processing dataset 1:
Original shape: (151, 11)
After removing duplicates: (151, 11)
Dropped columns with single value: ['Generation']
After dropping single-value columns: (151, 10)
Target is continuous - skipping removal of rare classes
Numeric columns: 6
Binary categorical columns: 1
Non-binary categorical columns: 2
Shape after one-hot encoding: (151, 35)
Final shape after preprocessing: (151, 35)
First 5 rows of processed dataset 1:
         HP    Attack   Defense   Sp. Atk   Sp. Def     Speed  Type 1_Bug  \
0 -0.674214 -0.888400 -0.716624 -0.075215 -0.042287 -0.893975           0   
1 -0.147811 -0.397981 -0.194770  0.452220  0.579664 -0.335241           0   
2  0.554059  0.356509  0.550736  1.155465  1.408931  0.409739           0   
3 -0.884775 -0.775227 -0.940275 -0.251026 -0.664238 -0.148996           0   
4 -0.217998 -0.322532 -0.381146  0.452220 -0.042287  0.409739           0   

   Type 1_Dragon  Type 1_Electric  Type 1_Fairy  ...  Type 2_Ground  \
0              0        

## Save data

In [28]:
# Save data
output_dir = './../processed_data/preprocessed_bj'
os.makedirs(output_dir, exist_ok=True)

dataset_names = [
    'pokemon_combat',
    'mpg',
    'insurance_payout',
    'concrete_strength',
    'laptop_price',
    'income_household'
]

for i, (df_processed, name) in enumerate(zip(processed_dfs, dataset_names)):
    output_path = os.path.join(output_dir, f"{name}_processed.csv")
    df_processed.to_csv(output_path, index=False)
    print(f"Dataset {i+1} saved to {output_path}")
    print(f"Shape: {df_processed.shape}")

Dataset 1 saved to ./../processed_data/preprocessed_bj\pokemon_combat_processed.csv
Shape: (151, 35)
Dataset 2 saved to ./../processed_data/preprocessed_bj\mpg_processed.csv
Shape: (398, 28)
Dataset 3 saved to ./../processed_data/preprocessed_bj\insurance_payout_processed.csv
Shape: (1337, 10)
Dataset 4 saved to ./../processed_data/preprocessed_bj\concrete_strength_processed.csv
Shape: (1005, 9)
Dataset 5 saved to ./../processed_data/preprocessed_bj\laptop_price_processed.csv
Shape: (1272, 44)
Dataset 6 saved to ./../processed_data/preprocessed_bj\income_household_processed.csv
Shape: (10000, 32)
