In [1]:
import numpy as np
import pandas as pd

# Load the raw table; the last column is treated as the class label and
# every column before it as a feature.
data = pd.read_csv("numeric_data1.csv")

# Class label: last column, taken as-is.
class_label = data.iloc[:, -1]

# Features: all remaining columns, forced to numeric. Entries that cannot be
# parsed become NaN (errors='coerce').
features = data.iloc[:, :-1].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# 1. Handling missing values
def handle_missing_values(features):
    """Fill every missing value in ``features`` with the mean of its column.

    The mean of a column is computed over its non-missing entries only. A
    column with no non-missing entries is filled with 0. The frame passed in
    is updated column by column and that same object is returned.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table. The non-missing values of each column must be
        convertible to float.

    Returns
    -------
    pandas.DataFrame
        The same ``features`` object, with missing values replaced.

    Raises
    ------
    ValueError
        If a non-missing value cannot be converted to float.
    """
    # First pass: work out the fill value for every column before touching
    # the frame, so a conversion error leaves ``features`` unmodified.
    column_means = {}
    for column in features:
        observed = features[column].dropna()
        if observed.empty:
            # If all values in the column are NaN, fill with 0
            column_means[column] = 0
        else:
            column_means[column] = np.mean(observed.to_numpy(dtype=float))

    # Second pass: assign the filled column back to the frame. Calling
    # ``features[column].fillna(..., inplace=True)`` would operate on a
    # temporary selection; with pandas Copy-on-Write that never reaches
    # ``features`` and the NaNs would silently remain.
    for column, fill_value in column_means.items():
        features[column] = features[column].fillna(fill_value)

    return features

# 2. Remove duplicates
def remove_duplicates(features):
    """Drop fully duplicated rows from ``features`` and report what happened.

    Prints a one-line message saying whether duplicates were found. When
    there are none, the input frame itself is returned; otherwise the result
    of ``drop_duplicates()`` is returned, which keeps the original index
    labels of the surviving rows.
    """
    has_duplicates = features.duplicated().any()

    # Nothing to do: hand the input back untouched.
    if not has_duplicates:
        print("No duplicates found.")
        return features

    print("Duplicates found and will be removed.")
    deduplicated = features.drop_duplicates()
    return deduplicated

# Apply handling missing values
features = handle_missing_values(features)
print("Data after handling missing values:")
print(features)

# Apply duplicate removal
features = remove_duplicates(features)
print("Data after removing duplicates:")
print(features)

# Also remove the corresponding rows from the class label.
# `features` still carries the original row labels of the surviving rows,
# so selecting by its index keeps exactly the matching labels.
class_label_no_duplicates = class_label.loc[features.index]

# Reattach the class label.
# pd.concat(axis=1) aligns rows by index label, so both pieces must share the
# same index when they are joined. Resetting only the label's index before
# the concat would pair labels with the wrong feature rows (and add NaN rows)
# whenever duplicates were dropped. Concatenate first, then renumber.
final_data = pd.concat(
    [features, class_label_no_duplicates], axis=1
).reset_index(drop=True)

# Sanity check: one output row per surviving feature row.
assert len(final_data) == len(features)

print("Final data with class label:")
print(final_data)

Data after handling missing values:
            f1        f2      f3      f4       f5        f6        f7      f8  \
0    -0.439400  0.141133  0.1718  0.4620  0.62260  0.163549  0.357800  0.0478   
1    -0.434800 -0.119800  0.2474  0.4036  0.32426  0.632800  0.494800  0.0338   
2    -0.233000  0.212400  0.5014  0.5222 -0.34220 -0.584000  0.024032 -0.6342   
3    -0.385396 -0.009600  0.2602  0.2554 -0.42900 -0.674600 -0.686800 -0.6650   
4    -0.341200  0.094600  0.6082  0.6216 -0.16220 -0.378400 -0.432400 -0.4358   
...        ...       ...     ...     ...      ...       ...       ...     ...   
7792 -0.684200 -0.328000 -0.1984  0.2956  0.87860  0.894800  0.311800  0.1822   
7793 -0.591200 -0.242000  0.8174  1.0000  0.46420  0.642800  0.694400  0.3056   
7794 -0.669600 -0.373000  0.1584  0.8910  1.00000  0.976200  0.976200  0.7684   
7795 -0.576400 -0.176400  0.5106  0.3742 -0.16700 -0.585800 -0.788200 -0.7224   
7796 -0.662400 -0.333400  0.3666  0.4292 -0.20840 -0.537400 -0.454200 -0.