# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# load data

In [None]:
# Load the dataset from the CSV file
df = pd.read_csv('dataset.csv')

# Strip leading/trailing whitespace from the header names before any lookup
# by column name. The preprocessing cell further down applies the same
# normalisation, so doing it here keeps the 'Address' drop below from
# failing on a padded header such as ' Address'.
df.columns = df.columns.str.strip()

# Display the first few rows to understand the data
print("Initial data preview:")
print(df.head())

# Drop the "Address" column since it is not used in our prediction model
df = df.drop(columns=['Address'])


# Quick information on the dataframe structure
print("\nDataframe info:")
df.info()


# Step 2: Data Preprocessing and Feature Engineering for predicting UnitPrice


In [None]:


# Load the dataset
df = pd.read_csv('dataset.csv')

# Remove leading and trailing whitespace from column names
df.columns = df.columns.str.strip()
print("Columns after stripping:", df.columns.tolist())

# Filter the data to keep only rows where Primary Use equals 3.
# The comparison is made on a numeric view of the column: if the raw column
# was read as text (for example because it also holds '--' placeholders),
# comparing the strings directly against the integer 3 would match nothing.
primary_use_numeric = pd.to_numeric(df['Primary Use'], errors='coerce')
df = df[primary_use_numeric == 3]
print("Shape after filtering to Primary Use == 3:", df.shape)


# --- Drop rows where the 'Floors' column contains a comma ---
# This filters out rows with values like "1,2,3" in the 'Floors' column.
# regex=False: the comma is matched as a literal character.
df = df[~df['Floors'].astype(str).str.contains(',', regex=False)]
print("Shape after dropping rows with comma in 'Floors':", df.shape)

# --- Replace Blanks and Placeholders ---
# Treat empty cells, whitespace-only cells and the '--' placeholder (with or
# without surrounding whitespace) as missing, then drop incomplete rows.
# Results are reassigned rather than mutated in place because `df` is a
# filtered selection of the loaded frame at this point.
df = df.replace(r'^\s*(?:--)?\s*$', np.nan, regex=True)
df = df.dropna()

# --- Convert ROC Date to Gregorian Timestamp ---
def convert_roc_date(roc_date_str):
    """Convert a ROC-calendar date string to a Gregorian ``pd.Timestamp``.

    The input is expected to look like ``"<roc_year>/<month>/<day>"``; the
    Gregorian year is obtained by adding 1911 to the ROC year. Whitespace
    around the whole string or around the individual parts is ignored.

    Parameters
    ----------
    roc_date_str : str
        Date text such as ``"110/5/15"``.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The converted date, or ``pd.NaT`` when the value is not a string,
        does not have exactly three ``/``-separated parts, contains a
        non-integer part, or does not describe a valid calendar date. A
        diagnostic line is printed for every value that cannot be converted.
    """
    try:
        parts = roc_date_str.strip().split('/')
        if len(parts) != 3:
            print(f"Unexpected format for date: {roc_date_str}")
            return pd.NaT

        roc_year, month, day = (int(part.strip()) for part in parts)
        # Checked explicitly so the diagnostic names the offending month.
        if not 1 <= month <= 12:
            raise ValueError(f"Month {month} out of valid range in date: {roc_date_str}")

        return pd.Timestamp(year=roc_year + 1911, month=month, day=day)
    except (AttributeError, TypeError, ValueError, OverflowError) as e:
        # AttributeError/TypeError: value is not a string (e.g. None or NaN).
        # ValueError/OverflowError: non-integer part or impossible date.
        # Anything else is unexpected and is allowed to propagate.
        print(f"Error converting date {roc_date_str}: {e}")
        return pd.NaT

# Convert the Date column
df['Date'] = df['Date'].apply(convert_roc_date)
print("Converted dates:")
print(df['Date'].head())

# For BuildingType and Primary Use, convert to numeric if necessary.
# errors='coerce' turns any value that cannot be parsed into NaN.
df['BuildingType'] = pd.to_numeric(df['BuildingType'], errors='coerce')
df['Primary Use'] = pd.to_numeric(df['Primary Use'], errors='coerce')

print("\nMissing values after cleaning numeric columns:")
print(df[['MainBuildingRatio', 'BuildingType', 'Primary Use']].isnull().sum())

# The conversions above can introduce new missing values (NaT for dates that
# failed to parse, NaN for coerced codes) after the earlier dropna() has
# already run. Remove those rows so they do not reach scaling and training.
df = df.dropna(subset=['Date', 'BuildingType', 'Primary Use'])
print("Shape after dropping rows with unparseable Date/BuildingType/Primary Use:", df.shape)

# --- Sort DataFrame by Date ---
# A stable sort keeps rows that share a date in their existing relative
# order, so the row sequence is reproducible from run to run.
df = df.sort_values('Date', kind='stable')

# --- Identify Features and Target ---
# Every column except Address (non-numeric), Date (used only for ordering)
# and the target UnitPrice is used as a model input.
features = [col for col in df.columns if col not in ['Address', 'UnitPrice', 'Date']]
target = 'UnitPrice'

# --- Scaling the Features and Target ---
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

# Fit the scalers on the leading 80% of the date-ordered rows only, then
# apply them to every row. The training cell later holds out the final 20%
# of the sequences chronologically; fitting min/max on the full dataset
# would leak the value range of that hold-out period into the training
# inputs. Scaled values in the hold-out portion may therefore fall outside
# [0, 1].
n_fit_rows = max(int(len(df) * 0.8), 1)
scaler_features.fit(df[features].iloc[:n_fit_rows])
scaler_target.fit(df[[target]].iloc[:n_fit_rows])

df_features_scaled = scaler_features.transform(df[features])
df_target_scaled = scaler_target.transform(df[[target]])

print("\nScaled features shape:", df_features_scaled.shape)
print("Scaled target shape:", df_target_scaled.shape)

# --- Create Sequences for LSTM Input ---
def create_sequences(features_data, target_data, window_size=5):
    """Build sliding-window samples for a sequence model.

    Sample ``i`` pairs the ``window_size`` consecutive feature rows
    ``features_data[i : i + window_size]`` with the target row that directly
    follows that window, ``target_data[i + window_size]``.

    Parameters
    ----------
    features_data : array-like
        Feature rows indexed along the first axis.
    target_data : array-like
        Target rows indexed along the first axis, aligned with
        ``features_data``.
    window_size : int, default 5
        Number of consecutive rows per input window; must be at least 1.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` where ``X`` has shape
        ``(n_samples, window_size, *features_data.shape[1:])`` and ``y`` has
        shape ``(n_samples, *target_data.shape[1:])``, with
        ``n_samples = max(len(features_data) - window_size, 0)``. When there
        are too few rows to form a single window, correctly shaped empty
        arrays are returned so that callers can still read ``X.shape[1]``
        and ``X.shape[2]``.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    features_arr = np.asarray(features_data)
    target_arr = np.asarray(target_data)
    n_samples = len(features_arr) - window_size

    if n_samples <= 0:
        # Not enough rows for even one window: keep the trailing dimensions
        # so downstream shape lookups do not fail on a flat (0,) array.
        empty_X = np.empty((0, window_size) + features_arr.shape[1:], dtype=features_arr.dtype)
        empty_y = np.empty((0,) + target_arr.shape[1:], dtype=target_arr.dtype)
        return empty_X, empty_y

    X = np.array([features_arr[start:start + window_size] for start in range(n_samples)])
    y = np.array([target_arr[start + window_size] for start in range(n_samples)])
    return X, y

# Number of consecutive rows (timesteps) that make up one input window.
window_size = 5
# Window the scaled feature/target arrays into model inputs X and targets y.
X, y = create_sequences(df_features_scaled, df_target_scaled, window_size=window_size)

print("\nInput sequence shape (samples, timesteps, features):", X.shape)
print("Output sequence shape (samples, target dimension):", y.shape)

# One-hot encode the BuildingType and Primary Use columns
# NOTE(review): this encoding runs after the features were scaled and
# windowed into X above, so the dummy columns created here are not part of
# X or y as built in this cell. If they were meant to be model inputs, the
# encoding has to happen before `features` is computed - TODO confirm intent.
# NOTE(review): df was filtered to Primary Use == 3 earlier, so 'Primary Use'
# presumably produces a single constant dummy column - verify.
df = pd.get_dummies(df, columns=['BuildingType', 'Primary Use'], prefix=['BT', 'PU'])

# After encoding, verify the new columns
print("Columns after one-hot encoding:", df.columns.tolist())


# LSTM model construction


In [None]:


# Define the shape of the input data
input_shape = (X.shape[1], X.shape[2])  # (timesteps, features)
print("Input shape to the model:", input_shape)

# Build the LSTM Model
model = Sequential()
# Declare the input with an explicit Input object. Passing `input_shape=` to
# the first layer of a Sequential model is deprecated in current Keras
# releases; an Input entry is the documented replacement.
model.add(tf.keras.Input(shape=input_shape))
# First LSTM layer, returns sequences for the next LSTM layer.
model.add(LSTM(units=50, activation='tanh', return_sequences=True))
model.add(Dropout(0.2))
# Second LSTM layer; no need to return sequences since it's the last LSTM.
model.add(LSTM(units=50, activation='tanh'))
model.add(Dropout(0.2))
# Final Dense layer for regression output (one value per sample)
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Display model summary
model.summary()


# Train

In [None]:


# --- Step 4: Train the LSTM Model ---

# Chronological hold-out: with shuffle=False the first 80% of the samples
# become the training set and the final 20% the validation set, which keeps
# the temporal order of the series intact.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

# Stop once val_loss has not improved for 10 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training configuration, collected in one place so epochs and batch_size
# are easy to adjust.
fit_options = dict(
    epochs=50,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

# Train the model.
history = model.fit(X_train, y_train, **fit_options)


# Evaluate the Model

In [None]:

# --- Plot Training and Validation Loss ---
# One curve per recorded metric, drawn on a dedicated axes object.
loss_fig, loss_ax = plt.subplots(figsize=(12, 6))
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss (MSE)')
loss_ax.set_title('Training and Validation Loss Over Epochs')
loss_ax.legend()
plt.show()

# --- Evaluate the Model on the Validation Set ---
# The model was compiled with mean squared error as its only loss, so
# evaluate() reports MSE in the scaled target space.
val_loss = model.evaluate(X_val, y_val, verbose=0)
print("Validation Loss (MSE):", val_loss)

# --- Make Predictions on the Validation Set ---
y_pred_scaled = model.predict(X_val)

# Map both the predictions and the held-out targets back through the target
# scaler so they are expressed in original UnitPrice units.
y_actual = scaler_target.inverse_transform(y_val)
y_pred = scaler_target.inverse_transform(y_pred_scaled)

# --- Plot Actual vs. Predicted UnitPrice ---
price_fig, price_ax = plt.subplots(figsize=(12, 6))
price_ax.plot(y_actual, label='Actual UnitPrice', marker='o', linestyle='-', markersize=3)
price_ax.plot(y_pred, label='Predicted UnitPrice', marker='x', linestyle='--', markersize=3)
price_ax.set_xlabel('Sample Index')
price_ax.set_ylabel('UnitPrice')
price_ax.set_title('Actual vs. Predicted UnitPrice on Validation Set')
price_ax.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression metrics on the validation set, computed on the inverse-scaled
# values held in y_actual and y_pred:
#   MSE - mean squared error
#   MAE - mean absolute error
#   R²  - coefficient of determination
mse_val = mean_squared_error(y_actual, y_pred)
mae_val = mean_absolute_error(y_actual, y_pred)
r2_val = r2_score(y_actual, y_pred)

print("Validation Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse_val),
    ("Mean Absolute Error (MAE)", mae_val),
    ("R² Score", r2_val),
):
    print(f"{metric_label}: {metric_value:.3f}")
