# Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Step 1:
- Read the Boston housing dataset.

- Create train and test datasets. 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the Boston housing data from a local CSV file.
# pd.read_csv returns a plain DataFrame, so the features and the target are
# selected by column; a DataFrame has no `.data`, `.feature_names` or
# `.target` attributes (those exist only on a scikit-learn Bunch object).
boston_data = pd.read_csv('boston_house_prices.csv')

# NOTE(review): assumes the target (the house price, conventionally named
# MEDV) is the LAST column of the CSV and that every other column is a
# feature — confirm against the file's header row.
target_column = boston_data.columns[-1]
X = boston_data.drop(columns=target_column)
y = boston_data[target_column]

# Create train and test datasets: 20% held out for testing, with a fixed
# random_state so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes of the datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### Step 2:
Create a scatter plot of each attribute with the mean house price.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean house price, drawn as a horizontal reference line in every panel
mean_price = y.mean()

# Size the subplot grid from the number of features instead of hard-coding
# 4x4: a fixed grid makes plt.subplot fail as soon as there are more than
# 16 feature columns. Ceiling division gives the rows needed for 4 columns
# (13 features still yield the original 4x4 layout).
n_plot_cols = 4
n_plot_rows = max(1, (len(X.columns) + n_plot_cols - 1) // n_plot_cols)

# Create scatter plots for each feature against the house price
plt.figure(figsize=(15, 10))

for i, column in enumerate(X.columns):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)  # subplot indices are 1-based
    plt.scatter(X[column], y, alpha=0.5)
    plt.axhline(y=mean_price, color='r', linestyle='--')  # Line for mean price
    plt.title(f'Scatter Plot of {column} vs House Price')
    plt.xlabel(column)
    plt.ylabel('House Price')

plt.tight_layout()
plt.show()

### Step 3:
Create a box plot of all attributes.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One figure with a single axes; every attribute is drawn as its own box on
# the same value axis.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=X, ax=ax)

# Labelling
ax.set_title('Box Plot of Boston Housing Attributes')
ax.set_xlabel('Attributes')
ax.set_ylabel('Value')
plt.xticks(rotation=45)  # tilt the attribute names so they do not overlap

# Render
fig.tight_layout()
plt.show()

### Step 4:
Use `StandardScaler()` to scale the training dataset.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no information from the
# test split influences the scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the training-derived scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shapes of the scaled datasets
for shape_label, scaled_split in (
    ("Scaled X_train shape:", X_train_scaled),
    ("Scaled X_test shape:", X_test_scaled),
):
    print(shape_label, scaled_split.shape)

### Step 5:
Use `KNeighborsRegressor()` to fit both the unscaled and the scaled datasets. Compare the test-set scores of the two models.

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Train two identically configured 5-nearest-neighbour regressors: one on
# the raw features and one on the scaled features.
knn_not_scaled = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
knn_scaled = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)

# Test-set predictions; each model is given the same feature representation
# it was trained on.
y_pred_not_scaled = knn_not_scaled.predict(X_test)
y_pred_scaled = knn_scaled.predict(X_test_scaled)

# Test-set score as reported by the estimator's own .score() method
score_not_scaled = knn_not_scaled.score(X_test, y_test)
score_scaled = knn_scaled.score(X_test_scaled, y_test)

# Mean squared error of the predictions against the true test targets
mse_not_scaled = mean_squared_error(y_test, y_pred_not_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)

# Report: unscaled model first, then scaled model
print("Test score (Not Scaled):", score_not_scaled)
print("Mean Squared Error (Not Scaled):", mse_not_scaled)
print("Test score (Scaled):", score_scaled)
print("Mean Squared Error (Scaled):", mse_scaled)

### Step 6:
Repeat Step 5 for `RandomForestRegressor`. Use

`RandomForestRegressor(n_estimators=100, random_state=0)`

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train two identically configured random forests (100 trees, fixed seed):
# one on the raw features and one on the scaled features.
rf_not_scaled = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
rf_scaled = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train_scaled, y_train)

# Test-set predictions; each model is given the same feature representation
# it was trained on.
y_pred_rf_not_scaled = rf_not_scaled.predict(X_test)
y_pred_rf_scaled = rf_scaled.predict(X_test_scaled)

# Test-set score as reported by the estimator's own .score() method
score_rf_not_scaled = rf_not_scaled.score(X_test, y_test)
score_rf_scaled = rf_scaled.score(X_test_scaled, y_test)

# Mean squared error of the predictions against the true test targets
mse_rf_not_scaled = mean_squared_error(y_test, y_pred_rf_not_scaled)
mse_rf_scaled = mean_squared_error(y_test, y_pred_rf_scaled)

# Report: unscaled model first, then scaled model
print("Test score (Not Scaled):", score_rf_not_scaled)
print("Mean Squared Error (Not Scaled):", mse_rf_not_scaled)
print("Test score (Scaled):", score_rf_scaled)
print("Mean Squared Error (Scaled):", mse_rf_scaled)
