# Approch: K-Nearest Neighbors


### Introduction

In this Notebook, I will be implementing KNN to predict the prices.



### Importing Libraries

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
from math import sqrt

### Loading the dataset

The data for this competition is split into three parts:

    1. Trainandvalid.csv: is the training set, which contains data through the end of 2011.
    3. Test.csv: is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.

The key fields are in train.csv are:

    SalesID: the uniue identifier of the sale
    MachineID: the unique identifier of a machine.  A machine can be sold multiple times
    saleprice: what the machine sold for at auction (only provided in train.csv)
    saledate: the date of the sale


i will use the `pandas` library to load the CSV file but the final dataset will be stored in a `numpy` array..

In [None]:
# Load the trainandvalid.csv dataset
df = pd.read_csv('TrainAndValid.csv', low_memory=False)

# Convert columns with mixed types to string
columns_to_convert_to_string = [13, 39, 40, 41]
df.iloc[:, columns_to_convert_to_string] = df.iloc[:, columns_to_convert_to_string].astype(str)

# Drop columns with more than 50% missing values
missing_threshold = 0.3
df = df.dropna(thresh=len(df) * (1 - missing_threshold), axis=1)

# Display remaining columns after the initial cleanup
print(f"Remaining columns: {df.columns}")

# Select features (X) and target variable (y)
# Update feature_columns based on the remaining columns
feature_columns = [col for col in ['MachineID', 'ModelID', 'YearMade', 'MachineHoursCurrentMeter'] if col in df.columns]
X = df[feature_columns]
y = df['SalePrice']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




Remaining columns: Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'saledate', 'fiModelDesc', 'fiBaseModel',
       'fiModelSeries', 'fiProductClassDesc', 'state', 'ProductGroup',
       'ProductGroupDesc', 'Enclosure', 'Hydraulics', 'Coupler_System',
       'Grouser_Tracks', 'Hydraulics_Flow'],
      dtype='object')


showing sample

In [None]:
df.sample(5)

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,saledate,fiModelDesc,fiBaseModel,fiModelSeries,fiProductClassDesc,state,ProductGroup,ProductGroupDesc,Enclosure,Hydraulics,Coupler_System,Grouser_Tracks,Hydraulics_Flow
46104,1287777,14500.0,1158972,3170,132,7.0,1990,8/31/2005 0:00,580K,580,,Backhoe Loader - 14.0 to 15.0 Ft Standard Digg...,Alabama,BL,Backhoe Loaders,OROPS,,,,
411328,6306536,36000.0,1824698,28919,149,1.0,2008,2/13/2012 0:00,WA2505L,WA250,5.0,Wheel Loader - 120.0 to 135.0 Horsepower,Florida,WL,Wheel Loader,OROPS,2 Valve,,,
301536,2244367,110000.0,964503,3377,136,1.0,1996,12/12/2008 0:00,163H,163,,Motorgrader - 170.0 to 200.0 Horsepower,Colorado,MG,Motor Graders,EROPS w AC,Base + 1 Function,,,
385314,4318917,12000.0,2294789,4579,172,1.0,2003,2/27/2009 0:00,210LE,210,,Wheel Loader - 60.0 to 80.0 Horsepower,California,WL,Wheel Loader,OROPS,2 Valve,,,
312266,2273327,17500.0,1497926,3537,136,1.0,1995,5/7/2008 0:00,416B,416,,Backhoe Loader - 14.0 to 15.0 Ft Standard Digg...,Texas,BL,Backhoe Loaders,OROPS,,,,


 Selecting features (X) and target variable (y)
 Updating feature_columns based on the remaining columns


In [None]:

feature_columns = [col for col in ['MachineID', 'ModelID', 'YearMade', 'MachineHoursCurrentMeter'] if col in df.columns]
X = df[feature_columns]
y = df['SalePrice']

# Split the dataset into training and validation sets
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=42)

Imputing missing values: filling in null values with the median value and
Standardizing the features

In [None]:
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_validate_imputed = imputer.transform(X_validate)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_validate_scaled = scaler.transform(X_validate_imputed)

# Choosing the value of k (number of neighbors) that gives the best result.


In [None]:
# Choose the value of k (number of neighbors)
import numpy as np

k = 3

# Create the KNN model
knn_model = KNeighborsRegressor(n_neighbors=k)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_validate_scaled)

# Evaluate the model
mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
rmsle = np.sqrt(np.mean(np.log1p(y_pred) - np.log1p(y_validate))**2)
print(f'RMSLE: {rmsle}')

print(f'K = {k}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


RMSLE: 0.04935006971789083
K = 3
Mean Absolute Error (MAE): 10101.895426863743
Root Mean Squared Error (RMSE): 15399.862681479395


In [None]:
# Choose the value of k (number of neighbors)
k = 5

# Create the KNN model
knn_model = KNeighborsRegressor(n_neighbors=k)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_validate_scaled)

# Evaluate the model
mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
#rmsle
rmsle = np.sqrt(np.mean(np.log1p(y_pred) - np.log1p(y_validate))**2)
print(f'RMSLE: {rmsle}')

print(f'K = {k}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


RMSLE: 0.06311363442182037
K = 5
Mean Absolute Error (MAE): 9969.882064453599
Root Mean Squared Error (RMSE): 15047.853215643847


In [None]:
# Choose the value of k (number of neighbors)
k = 7

# Create the KNN model
knn_model = KNeighborsRegressor(n_neighbors=k)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_validate_scaled)

# Evaluate the model
mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
rmsle = np.sqrt(np.mean(np.log1p(y_pred) - np.log1p(y_validate))**2)
print(f'RMSLE: {rmsle}')

print(f'K = {k}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


RMSLE: 0.07108564825349002
K = 7
Mean Absolute Error (MAE): 9972.504317214163
Root Mean Squared Error (RMSE): 14964.248802372269


In [None]:
# Choose the value of k (number of neighbors)
k = 9

# Create the KNN model
knn_model = KNeighborsRegressor(n_neighbors=k)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_validate_scaled)

# Evaluate the model
mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
rmsle = np.sqrt(np.mean(np.log1p(y_pred) - np.log1p(y_validate))**2)
print(f'RMSLE: {rmsle}')

print(f'K = {k}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

RMSLE: 0.07703128806213146
K = 9
Mean Absolute Error (MAE): 10012.458487198126
Root Mean Squared Error (RMSE): 14983.85523368837


best value of k is 7; therefore, we'll make the test on k=7

In [None]:
k = 7

# Create the KNN model
knn_model = KNeighborsRegressor(n_neighbors=k)

# Train the model
knn_model.fit(X_validate_scaled, y_validate)

# Make predictions on the test set
y_pred = knn_model.predict(X_validate_scaled)

# Evaluate the model
mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
rmsle = np.sqrt(np.mean(np.log1p(y_pred) - np.log1p(y_validate))**2)
print(f'RMSLE: {rmsle}')

print(f'K = {k}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


RMSLE: 0.08685463585408258
K = 7
Mean Absolute Error (MAE): 9553.078831735262
Root Mean Squared Error (RMSE): 14144.321847453935


finally, predicting the prices on on test dataset.

In [None]:
# Load the test dataset
test_df = pd.read_csv('Test.csv')

# Select the same features used for training
test_features = test_df[feature_columns]

# Impute missing values using the same imputer used for training
test_imputed = imputer.transform(test_features)

# Standardize the features using the same scaler used for training
test_scaled = scaler.transform(test_imputed)

# Make predictions on the test set
test_predictions = knn_model.predict(test_scaled)

# Create a DataFrame to store the results
results_df = pd.DataFrame({'SalePrice_Predicted': test_predictions})

# Concatenate the results with the original test data
final_results_df = pd.concat([test_df, results_df], axis=1)

# Print the final results
print(final_results_df[['SalesID', 'SalePrice_Predicted']])

       SalesID  SalePrice_Predicted
0      1227829         22285.714286
1      1227844         19357.142857
2      1227847         45785.714286
3      1227848         81642.857143
4      1227863         32071.428571
...        ...                  ...
12452  6643171         26642.857143
12453  6643173         26642.857143
12454  6643184         20785.714286
12455  6643186         26642.857143
12456  6643196         26642.857143

[12457 rows x 2 columns]
