# Missing Value Estimation 1

In [1]:
# Useful libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn import metrics  
from sklearn import tree

In [2]:
# Load the raw table. header=None tells pandas the file has no header row, so
# the first line is treated as data and the columns get integer labels.
# In the raw frame some cells hold 1.000000e+99 (replaced with NaN further down).
df = pd.read_csv(
    filepath_or_buffer="MissingData1.csv",
    header=None,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.11,0.02,-3.600000e-01,-1.100000e-01,4.800000e-01,-0.20,0.27,0.29,-0.34,-0.05,0.23,0.08,-0.45,0.25
1,-0.30,-0.37,-1.800000e-01,-9.000000e-02,-1.600000e-01,-0.16,-0.10,-0.09,0.46,0.32,0.02,0.31,0.40,-0.07
2,0.50,0.18,4.100000e-01,1.000000e+99,1.000000e+99,0.15,-0.25,-0.41,-0.07,-0.13,-0.15,-0.01,-0.28,-0.09
3,0.00,-0.11,1.000000e+99,1.900000e-01,0.000000e+00,-0.07,0.18,0.18,-0.23,-0.16,0.03,-0.12,-0.11,0.23
4,0.40,-0.16,3.100000e-01,-3.400000e-01,2.000000e-02,0.21,-0.70,0.11,0.08,0.05,-0.09,0.07,0.04,-0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,-0.45,-0.31,-2.600000e-01,-7.400000e-01,-6.000000e-02,0.09,-0.37,0.38,0.35,0.45,0.13,0.27,0.31,0.19
238,0.12,-0.27,3.000000e-02,3.600000e-01,1.090000e+00,0.41,0.15,0.01,-0.22,-0.01,-0.09,-0.55,-0.55,-0.49
239,1.31,0.05,-1.700000e-01,2.300000e-01,-2.500000e-01,-0.76,-0.06,-0.22,0.36,-0.17,-0.46,-0.06,0.21,-0.01
240,0.34,0.03,-1.200000e-01,-3.000000e-02,3.300000e-01,-0.23,-0.08,0.20,-0.08,0.08,-0.12,-0.14,0.12,-0.30


In [3]:
# Anything above 1000 is treated as the 1.000000e+99 placeholder and is
# overwritten with NaN (df is modified in place).
df[df.gt(1000)] = np.nan

# Number of cells in the table (rows x columns)
total = df.size

# Cells that still hold a value: count() skips NaN, summed over all columns
non_nan_count = df.count().sum()

# The remainder are the missing cells
nan_count = total - non_nan_count

print("Missing value percentage:", nan_count / total * 100, "%")
df

Missing value percentage: 3.4828807556080283 %


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.11,0.02,-0.36,-0.11,0.48,-0.20,0.27,0.29,-0.34,-0.05,0.23,0.08,-0.45,0.25
1,-0.30,-0.37,-0.18,-0.09,-0.16,-0.16,-0.10,-0.09,0.46,0.32,0.02,0.31,0.40,-0.07
2,0.50,0.18,0.41,,,0.15,-0.25,-0.41,-0.07,-0.13,-0.15,-0.01,-0.28,-0.09
3,0.00,-0.11,,0.19,0.00,-0.07,0.18,0.18,-0.23,-0.16,0.03,-0.12,-0.11,0.23
4,0.40,-0.16,0.31,-0.34,0.02,0.21,-0.70,0.11,0.08,0.05,-0.09,0.07,0.04,-0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,-0.45,-0.31,-0.26,-0.74,-0.06,0.09,-0.37,0.38,0.35,0.45,0.13,0.27,0.31,0.19
238,0.12,-0.27,0.03,0.36,1.09,0.41,0.15,0.01,-0.22,-0.01,-0.09,-0.55,-0.55,-0.49
239,1.31,0.05,-0.17,0.23,-0.25,-0.76,-0.06,-0.22,0.36,-0.17,-0.46,-0.06,0.21,-0.01
240,0.34,0.03,-0.12,-0.03,0.33,-0.23,-0.08,0.20,-0.08,0.08,-0.12,-0.14,0.12,-0.30


In [4]:
# Fill every NaN from the rows most similar to the one it sits in: KNNImputer
# replaces a missing cell with the mean of that feature over the n_neighbors
# nearest rows (2 here).
imputer = KNNImputer(n_neighbors=2)

# Fit on the frame and impute in a single pass; the result comes back as a
# plain array without the frame's labels.
imputed_data = imputer.fit_transform(df)

# Re-wrap as a DataFrame, restoring BOTH axis labels from df. Passing
# index=df.index keeps each imputed row aligned with its original row even
# when df does not carry the default 0..n-1 index.
imputed_df = pd.DataFrame(imputed_data, columns=df.columns, index=df.index)

print("\nDataFrame After KNN Imputation:")
imputed_df


DataFrame After KNN Imputation:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.11,0.02,-0.360,-0.11,0.48,-0.20,0.27,0.29,-0.34,-0.05,0.23,0.08,-0.45,0.25
1,-0.30,-0.37,-0.180,-0.09,-0.16,-0.16,-0.10,-0.09,0.46,0.32,0.02,0.31,0.40,-0.07
2,0.50,0.18,0.410,0.12,-0.01,0.15,-0.25,-0.41,-0.07,-0.13,-0.15,-0.01,-0.28,-0.09
3,0.00,-0.11,-0.015,0.19,0.00,-0.07,0.18,0.18,-0.23,-0.16,0.03,-0.12,-0.11,0.23
4,0.40,-0.16,0.310,-0.34,0.02,0.21,-0.70,0.11,0.08,0.05,-0.09,0.07,0.04,-0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,-0.45,-0.31,-0.260,-0.74,-0.06,0.09,-0.37,0.38,0.35,0.45,0.13,0.27,0.31,0.19
238,0.12,-0.27,0.030,0.36,1.09,0.41,0.15,0.01,-0.22,-0.01,-0.09,-0.55,-0.55,-0.49
239,1.31,0.05,-0.170,0.23,-0.25,-0.76,-0.06,-0.22,0.36,-0.17,-0.46,-0.06,0.21,-0.01
240,0.34,0.03,-0.120,-0.03,0.33,-0.23,-0.08,0.20,-0.08,0.08,-0.12,-0.14,0.12,-0.30
