# In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Load the previously cleaned dataset.
data = pd.read_csv('cleaned_data.csv')

# Keep only rows inside the latitude/longitude bounding box (both bounds
# inclusive). Rows whose coordinate is missing fail the test and are dropped.
lat_in_range = data['Latitude'].between(44.4, 45.1)
lon_in_range = data['Longitude'].between(-64, -63.2)

# Renumber the surviving rows 0..n-1 so that the positional output of the
# imputer below lines up with `data` when it is assigned back.
data = data[lat_in_range & lon_in_range].reset_index(drop=True)

# Columns handed to the imputer: the coordinates plus the three scores.
columns_for_imputation = ['Longitude', 'Latitude', 'Transit Score', 'Bike Score', 'Walk Score']
data_for_imputation = data[columns_for_imputation]

# Fill missing entries from the 3 nearest neighbouring rows.
imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(data_for_imputation)

# The imputer returns a bare array; wrap it so the columns are named again.
# A freshly built frame already carries a 0..n-1 index.
imputed_df = pd.DataFrame(imputed_data, columns=columns_for_imputation)

# Imputed scores are neighbour averages, so round them to whole numbers
# and store them as integers.
score_columns = ['Transit Score', 'Bike Score', 'Walk Score']
for score_column in score_columns:
    imputed_df[score_column] = imputed_df[score_column].round().astype(int)

# Write the imputed (and rounded) columns back into the main frame.
data[columns_for_imputation] = imputed_df

# Outlier Removal
def modified_z_score(column):
    """Return the modified z-score of every value in *column*.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median
    absolute deviation from the median.

    Args:
        column: A numeric pandas Series.

    Returns:
        A Series with the same index as *column*. Missing values in the
        input stay missing in the output. When the MAD is zero, every
        non-missing score is zero.
    """
    median = column.median()
    # Use the pandas median (which skips missing values) for the MAD as well.
    # np.median propagates NaN, so a single missing value would otherwise
    # turn every score in the column into NaN.
    mad = (column - median).abs().median()
    if mad == 0:  # Prevent division by zero
        return column * 0  # Return a zeroed series to preserve shape
    return 0.6745 * (column - median) / mad

# Apply the Modified Z-Score Method: a row is kept only if none of its
# numeric columns flags it as an outlier.
threshold = 3.5  # Adjust based on your tolerance for outliers

# Positional boolean mask over the rows. Filtering with a mask (instead of
# collecting index labels in a set) preserves the original row order.
keep_mask = np.ones(len(data), dtype=bool)

# 'number' selects every numeric dtype, not only float64/int64; astype(int)
# yields int32 on some platforms, which the narrower list would skip.
for col in data.select_dtypes(include='number').columns:
    z_scores = modified_z_score(data[col])
    within_threshold = z_scores.abs() < threshold
    # A missing score gives no evidence that the row is an outlier, so rows
    # with a missing value in this column are kept rather than dropped.
    keep_mask &= (within_threshold | z_scores.isna()).to_numpy(dtype=bool)

# Labels of the surviving rows, in row order. Both names are kept so that
# any later code referring to them keeps working.
cleaned_indices_list = data.index[keep_mask].tolist()
cleaned_indices = set(cleaned_indices_list)

# Filter the DataFrame down to the rows that passed every column's check
data = data.loc[keep_mask]

# Write file for easier viewing
data.to_csv('filtered_data.csv', index=False)


# 