In [None]:
import pandas as pd   
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.impute import KNNImputer
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the partially preprocessed dataset; the cleaning cells below work on a copy of it.
df_origin = pd.read_csv(filepath_or_buffer="Dataset/Patial Preprocessing/Data.csv")

#### a, The mileage

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
df = df_origin.copy()

# A used car with a mileage below 350 is considered implausible here:
# blank the value so it is handled as missing from now on.
condition = df['condition'].eq('Used car') & df['mileage'].lt(350)
df.loc[condition, 'mileage'] = np.nan

#### b, Engine_capacity

In [None]:
# An engine capacity of exactly 0 is treated as a missing value.
df.loc[df['engine_capacity'].eq(0), 'engine_capacity'] = np.nan

#### c, Fuel_consumption

In [None]:
# A fuel consumption of exactly 0 is treated as a missing value.
df.loc[df['fuel_consumption'].eq(0), 'fuel_consumption'] = np.nan

### 2, Fill the null values

#### a, One hot encoding

In [None]:
# These three columns are left out of the feature matrix used for imputation.
df_onehot = df.drop(columns=['ad_id', 'url', 'price'])

In [None]:
# Show (rows, columns) of the feature frame.
df_onehot.shape

In [None]:
# Columns that get one-hot encoded; everything else in df_onehot is kept as is.
categorical_columns = [
    'origin',
    'condition',
    'car_model',
    'exterior_color',
    'interior_color',
    'num_of_doors',
    'engine',
    'transmission',
    'drive_type',
    'brand',
    'grade',
    'car_name',
]

In [None]:
# Build an encoder that returns a dense array. drop='first' removes one level
# per feature so the dummy columns of a feature are not perfectly collinear.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

# Learn the categories and encode the categorical columns in one step
onehot_encoded = onehot_encoder.fit_transform(df_onehot[categorical_columns])

# Wrap the encoded array in a DataFrame with readable column names, keeping the
# row labels of df_onehot so a later column-wise concat lines up row by row
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
    index=df_onehot.index,
)
onehot_encoded_df.shape

In [None]:
# Give the encoded frame the same row labels as df_onehot: pd.concat(axis=1)
# aligns rows by label, so mismatched labels would produce extra NaN-filled rows.
onehot_encoded_df.index = df_onehot.index

In [None]:
# Concatenate the one-hot encoded columns with the original DataFrame
df_final = pd.concat([df_onehot.drop(columns=categorical_columns), onehot_encoded_df], axis=1)

# Display the final DataFrame
df_final.shape

#### b, Standard scaling

In [None]:
# StandardScaler with default settings: each column is centred and scaled to unit variance.
scaler = StandardScaler()

In [None]:
# Standardise every column of df_final. Column names and row labels are not
# passed on, so df_scaled carries default integer labels on both axes.
df_scaled = pd.DataFrame(data=scaler.fit_transform(df_final))

In [None]:
# Pairwise Euclidean distances that skip coordinates missing in either row,
# converted to a similarity 1 / (1 + distance): identical rows score 1 and the
# score shrinks towards 0 as rows get further apart.
proximity_matrix = np.divide(
    1,
    1 + pairwise_distances(df_scaled, metric='nan_euclidean'),
)



In [None]:
# Renumber the rows 0..n-1 so that row labels coincide with row positions in
# proximity_matrix (the old labels are discarded, not kept as a column).
df_final = df_final.reset_index(drop=True)

In [None]:
def impute_with_proximity_matrix(data, proximity_matrix, k=5, verbose=True):
    """Fill missing values with a proximity-weighted average of nearby rows.

    Rows are matched to ``proximity_matrix`` by position: row ``p`` of ``data``
    corresponds to row and column ``p`` of the matrix. ``data`` may therefore
    carry any index; its labels are never used to address the matrix.

    For every missing cell the ``k`` most similar other rows are looked up.
    Those that have an observed value in the same column and a finite
    proximity contribute to a weighted mean, with the proximities as weights.
    If no such neighbour remains (or the weights sum to zero), the mean of the
    observed values of that column in ``data`` is used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Columns that contain missing values must be numeric.
    proximity_matrix : array-like of shape (n_rows, n_rows)
        Pairwise similarities; larger means closer.
    k : int, default 5
        Number of nearest rows inspected for each missing cell.
    verbose : bool, default True
        When true, print the frame shape and, for each imputed column, its
        name and the labels of the rows being filled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing cells filled. ``data`` itself is
        not modified.
    """
    proximity = np.asarray(proximity_matrix)
    n_rows = len(data)
    imputed_data = data.copy()
    if verbose:
        print(data.shape)

    for col_pos, col_with_missing in enumerate(data.columns):
        column = data.iloc[:, col_pos]
        missing_mask = column.isna().to_numpy()
        if not missing_mask.any():
            continue

        missing_positions = np.flatnonzero(missing_mask)
        if verbose:
            print(col_with_missing)
            print(data.index[missing_positions])

        column_values = column.to_numpy()
        # The fallback value only depends on the input column, so compute it once.
        column_mean = column.mean()

        imputed_values = []
        for pos in missing_positions:
            # k + 1 candidates because the row itself is normally its own best match.
            candidates = np.argsort(-proximity[pos])[:k + 1]
            candidates = candidates[candidates != pos][:k]

            # Ignore matrix entries that do not correspond to a row of `data`,
            # then neighbours whose value in this column is missing as well.
            candidates = candidates[candidates < n_rows]
            candidates = candidates[~missing_mask[candidates]]

            # A NaN/inf proximity would turn the whole weighted mean into NaN.
            weights = proximity[pos, candidates]
            usable = np.isfinite(weights)
            candidates = candidates[usable]
            weights = weights[usable]

            total_weight = weights.sum()
            if candidates.size == 0 or not total_weight > 0:
                imputed_values.append(column_mean)
            else:
                neighbor_values = column_values[candidates].astype(float)
                imputed_values.append(np.dot(weights, neighbor_values) / total_weight)

        # Positional write-back keeps this correct for any index on `data`.
        imputed_data.iloc[missing_positions, col_pos] = np.asarray(imputed_values, dtype=float)

    return imputed_data

# Impute every column of df_final that has missing values (k is left at its default of 5)
df_final_imputed = impute_with_proximity_matrix(
    data=df_final,
    proximity_matrix=proximity_matrix,
)


In [None]:
def replace_columns(df1, df2, columns_to_replace, verbose=True):
    """Overwrite selected columns of ``df1`` with the non-null values of ``df2``.

    Rows are paired by position (first row with first row, and so on), not by
    index label, so the two frames may carry different indexes. Only the first
    ``min(len(df1), len(df2))`` rows are touched. Wherever ``df2`` holds a null
    the value already present in ``df1`` is kept.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns are updated; it is copied, never modified.
    df2 : pandas.DataFrame
        Frame providing the replacement values.
    columns_to_replace : iterable of str
        Column names present in both frames.
    verbose : bool, default True
        When true, print both shapes and the number of rows being compared.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df1`` with the requested columns updated.

    Raises
    ------
    KeyError
        If a requested column is missing from either frame.
    """
    df1_updated = df1.copy()
    min_rows = min(df1_updated.shape[0], df2.shape[0])
    if verbose:
        print(df1_updated.shape)
        print(df2.shape)
        print(min_rows)

    for col in columns_to_replace:
        col_pos = df1_updated.columns.get_loc(col)
        if min_rows == 0:
            # Nothing to pair up; the column lookup above still validates `col`.
            continue

        current = df1_updated[col].iloc[:min_rows]
        incoming = df2[col].iloc[:min_rows]
        # Plain arrays switch off label alignment, so values pair up by position.
        merged = current.where(incoming.isnull().to_numpy(), incoming.to_numpy())
        df1_updated.iloc[:min_rows, col_pos] = merged.to_numpy()

    return df1_updated

# Only the three cleaned numeric columns are copied back into the full frame.
columns_to_replace = [
    'mileage',
    'engine_capacity',
    'fuel_consumption',
]
df_updated = replace_columns(
    df1=df,
    df2=df_final_imputed,
    columns_to_replace=columns_to_replace,
)

In [None]:
# Persist the cleaned and imputed frame without writing the row index.
df_updated.to_csv(
    path_or_buf="Dataset/Final/Remove null, and fill null.csv",
    index=False,
)