In [37]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from neo4j import GraphDatabase
import gc

In [39]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Clear memory
gc.collect()

# Kết nối tới Neo4j
class Neo4jConnection:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
    
    def close(self):
        self.driver.close()
    
    def run_query(self, query, parameters=None):
        with self.driver.session() as session:
            session.run(query, parameters)
    
    def get_nodes(self):
        with self.driver.session() as session:
            result = session.run("""
                MATCH (h:House)
                RETURN h.house_id AS house_id,
                       h.carpet_area AS carpet_area,
                       h.super_area AS super_area,
                       h.bathroom AS bathroom,
                       h.balcony AS balcony,
                       h.current_floor AS current_floor,
                       h.total_floors AS total_floors,
                       h.bhk AS bhk,
                       h.price AS price,
                       h.car_parking AS car_parking,
                       h.price_x_super_area AS price_x_super_area,
                       h.amount AS amount,
                       h.transaction AS transaction,
                       h.furnishing AS furnishing,
                       h.overlooking AS overlooking,
                       h.ownership AS ownership,
                       h.facing AS facing
            """)
            return pd.DataFrame([record.values() for record in result], columns=result.keys())
    
    def get_edges(self, relationship_type):
        with self.driver.session() as session:
            result = session.run(f"""
                MATCH (h1:House)-[r:{relationship_type}]->(h2:House)
                RETURN h1.house_id AS source, h2.house_id AS target, r.weight AS weight
            """)
            return pd.DataFrame([record.values() for record in result], columns=result.keys())

In [41]:
# Đọc dữ liệu
train_df = pd.read_csv('data2/train.csv')
test_df = pd.read_csv('data2/test.csv')

# Feature engineering: Add Price * Super Area
train_df['Price_x_SuperArea'] = train_df['Price'] * train_df['Super Area']
test_df['Price_x_SuperArea'] = test_df['Price'] * test_df['Super Area']

# Thêm cột index làm house_id
train_df['index'] = train_df.index
test_df['index'] = test_df.index

In [45]:
# (Tùy chọn) Giảm số lượng node để kiểm tra
# train_df = train_df.head(5000)

# Kết nối tới Neo4j
neo4j_conn = Neo4jConnection("bolt://localhost:7687", "neo4j", "hqiineo4j")  # Thay bằng thông tin của bạn

# Xóa dữ liệu cũ (nếu cần)
neo4j_conn.run_query("MATCH (n) DETACH DELETE n")

# Tạo node House
print("Creating House nodes...")
for _, row in train_df.iterrows():
    query = """
        CREATE (h:House {
            house_id: $house_id,
            carpet_area: $carpet_area,
            super_area: $super_area,
            bathroom: $bathroom,
            balcony: $balcony,
            current_floor: $current_floor,
            total_floors: $total_floors,
            bhk: $bhk,
            price: $price,
            car_parking: $car_parking,
            price_x_super_area: $price_x_super_area,
            amount: $amount,
            transaction: $transaction,
            furnishing: $furnishing,
            overlooking: $overlooking,
            society: $society,
            ownership: $ownership,
            facing: $facing,
            location: $location
        })
    """
    parameters = {
        "house_id": int(row['index']),
        "carpet_area": float(row['Carpet Area']),
        "super_area": float(row['Super Area']),
        "bathroom": int(row['Bathroom']),
        "balcony": int(row['Balcony']),
        "current_floor": int(row['Current Floor']),
        "total_floors": int(row['Total Floors']),
        "bhk": int(row['BHK']),
        "price": float(row['Price']),
        "car_parking": int(row['Car Parking']),
        "price_x_super_area": float(row['Price_x_SuperArea']),
        "amount": float(row['Amount']),
        "transaction": row['Transaction'],
        "furnishing": row['Furnishing'],
        "overlooking": row['Overlooking'],
        "society": row['Society'],
        "ownership": row['Ownership'],
        "facing": row['Facing'],
        "location": row['Location']
    }
    neo4j_conn.run_query(query, parameters)

Creating House nodes...


In [46]:
# Tạo node Society
print("Creating Society nodes...")
societies = train_df['Society'].unique()
for society in societies:
    query = """
        MERGE (s:Society {name: $name})
    """
    neo4j_conn.run_query(query, {"name": society})

Creating Society nodes...


In [47]:
# Tạo node Location
print("Creating Location nodes...")
locations = train_df['Location'].unique()
for location in locations:
    query = """
        MERGE (l:Location {name: $name})
    """
    neo4j_conn.run_query(query, {"name": location})

Creating Location nodes...


In [48]:
# Tạo relationship BELONGS_TO_SOCIETY
print("Creating BELONGS_TO_SOCIETY relationships...")
for _, row in train_df.iterrows():
    query = """
        MATCH (h:House {house_id: $house_id})
        MATCH (s:Society {name: $society})
        CREATE (h)-[:BELONGS_TO_SOCIETY]->(s)
    """
    parameters = {
        "house_id": int(row['index']),
        "society": row['Society']
    }
    neo4j_conn.run_query(query, parameters)

Creating BELONGS_TO_SOCIETY relationships...


In [49]:
# Tạo relationship BELONGS_TO_LOCATION
print("Creating BELONGS_TO_LOCATION relationships...")
for _, row in train_df.iterrows():
    query = """
        MATCH (h:House {house_id: $house_id})
        MATCH (l:Location {name: $location})
        CREATE (h)-[:BELONGS_TO_LOCATION]->(l)
    """
    parameters = {
        "house_id": int(row['index']),
        "location": row['Location']
    }
    neo4j_conn.run_query(query, parameters)

Creating BELONGS_TO_LOCATION relationships...


In [61]:
# Tạo relationship SIMILAR_BHK (chia nhỏ theo BHK)
print("Creating SIMILAR_BHK relationships...")
bhk_values = train_df['BHK'].unique()
for bhk in bhk_values:
    print(f"Processing BHK={bhk}...")
    query_similar_bhk = """
        MATCH (h1:House {bhk: $bhk})
        MATCH (h2:House {bhk: $bhk})
        WHERE h1.house_id < h2.house_id
        WITH h1, h2,
             abs(h1.amount - h2.amount) / (h1.amount + h2.amount + 0.00001) AS price_similarity
        WHERE price_similarity < 0.15
        CREATE (h1)-[:SIMILAR_BHK {weight: 1.0 - price_similarity}]->(h2)
    """
    neo4j_conn.run_query(query_similar_bhk, {"bhk": int(bhk)})

Creating SIMILAR_BHK relationships...
Processing BHK=3...


TransientError: {code: Neo.TransientError.General.MemoryPoolOutOfMemoryError} {message: The allocation of an extra 48,0 MiB would use more than the limit 716,8 MiB. Currently using 684,0 MiB. dbms.memory.transaction.total.max threshold reached}

In [None]:
# Tạo relationship SIMILAR_PRICE (chia nhỏ theo khoảng giá)
print("Creating SIMILAR_PRICE relationships...")
amount_bins = pd.qcut(train_df['Amount'], q=10, duplicates='drop').cat.categories
for i in range(len(amount_bins)):
    min_amount = amount_bins[i].left
    max_amount = amount_bins[i].right
    print(f"Processing Amount range: {min_amount} to {max_amount}...")
    query_similar_price = """
        MATCH (h1:House)
        MATCH (h2:House)
        WHERE h1.amount >= $min_amount AND h1.amount <= $max_amount
          AND h2.amount >= $min_amount AND h2.amount <= $max_amount
          AND h1.house_id < h2.house_id
        WITH h1, h2,
             abs(h1.amount - h2.amount) / (h1.amount + h2.amount + 0.00001) AS price_similarity
        WHERE price_similarity < 0.1
        CREATE (h1)-[:SIMILAR_PRICE {weight: 1.0 - price_similarity}]->(h2)
    """
    neo4j_conn.run_query(query_similar_price, {"min_amount": float(min_amount), "max_amount": float(max_amount)})

In [None]:
# Trích xuất dữ liệu từ Neo4j
print("Extracting data from Neo4j...")
nodes_df = neo4j_conn.get_nodes()
similar_bhk_edges = neo4j_conn.get_edges("SIMILAR_BHK")
similar_price_edges = neo4j_conn.get_edges("SIMILAR_PRICE")

# Đóng kết nối Neo4j
neo4j_conn.close()

In [None]:
# Kết hợp các cạnh
edges_df = pd.concat([similar_bhk_edges, similar_price_edges], ignore_index=True)

# Tạo đặc trưng
numerical_cols = ['carpet_area', 'super_area', 'bathroom', 'balcony', 'current_floor', 
                 'total_floors', 'bhk', 'price', 'car_parking', 'price_x_super_area']
categorical_cols = ['transaction', 'furnishing', 'overlooking', 'ownership', 'facing']

scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(nodes_df[numerical_cols])

encoded_features = []
encoders = {}
for col in categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = encoder.fit_transform(nodes_df[[col]])
    encoded_df = pd.DataFrame(encoded, columns=[f"{col}_{cat}" for cat in encoder.categories_[0]])
    encoded_features.append(encoded_df)
    encoders[col] = encoder

In [None]:
features = pd.concat([pd.DataFrame(numerical_features, columns=numerical_cols)] + encoded_features, axis=1)
y = np.log1p(nodes_df['amount'].values)

# Tạo edge_index và edge_weight
edge_index = torch.tensor(edges_df[['source', 'target']].values.T, dtype=torch.long)
edge_weight = torch.tensor(edges_df['weight'].values, dtype=torch.float)

# Tạo dữ liệu cho PyTorch Geometric
data = Data(
    x=torch.tensor(features.values, dtype=torch.float),
    edge_index=edge_index,
    edge_attr=edge_weight,
    y=torch.tensor(y, dtype=torch.float)
)

In [None]:
# Kiểm tra số lượng đỉnh và cạnh
print(f"Graph: {data.num_nodes} nodes, {data.num_edges} edges")

# GNN Model with GATConv
class GNNModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(GNNModel, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=2, concat=True)
        self.bn1 = torch.nn.BatchNorm1d(hidden_dim * 2)
        self.conv2 = GATConv(hidden_dim * 2, 32, heads=2, concat=True)
        self.bn2 = torch.nn.BatchNorm1d(32 * 2)
        self.conv3 = GATConv(32 * 2, 16, heads=1, concat=False)
        self.bn3 = torch.nn.BatchNorm1d(16)
        self.fc = torch.nn.Linear(16, 1)
    
    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr=edge_weight)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.conv2(x, edge_index, edge_attr=edge_weight)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_attr=edge_weight)
        x = self.bn3(x)
        x = F.relu(x)
        x = self.fc(x)
        return x.squeeze()

In [None]:
# Training on CPU
device = torch.device('cpu')
model = GNNModel(input_dim=features.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

data = data.to(device)

In [None]:
model.train()
for epoch in range(1500):
    optimizer.zero_grad()
    out = model(data)
    mse_loss = F.mse_loss(out, data.y)
    mae_loss = F.l1_loss(out, data.y)
    mse_weight = min(0.8, 0.2 + epoch / 2500)
    mae_weight = 1 - mse_weight
    loss = mse_weight * mse_loss + mae_weight * mae_loss
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step(loss)
    if (epoch + 1) % 50 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}, MSE Weight: {mse_weight:.2f}')
        with torch.no_grad():
            sample_pred = out[:5].cpu().numpy()
            sample_true = data.y[:5].cpu().numpy()
            print(f"Sample Predictions: {sample_pred}")
            print(f"Sample True Values: {sample_true}")

In [None]:
# Evaluation (dùng tập test từ file)
def preprocess_data(df, numerical_cols, categorical_cols, target_col, 
                   scaler=None, encoders=None, target_encoders=None, target_scaler=None, is_train=True):
    df = df.copy()
    
    target_encode_cols = ['Society', 'Location', 'Overlooking']
    
    if is_train:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    
    encoded_features = []
    if is_train:
        encoders = {}
        target_encoders = {}
        target_scaler = MinMaxScaler()
        
        one_hot_cols = ['Transaction', 'Furnishing', 'Ownership', 'Facing']
        for col in one_hot_cols:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoded = encoder.fit_transform(df[[col]])
            encoded_df = pd.DataFrame(
                encoded, 
                columns=[f"{col}_{cat}" for cat in encoder.categories_[0]]
            )
            encoded_features.append(encoded_df)
            encoders[col] = encoder
        
        target_encoded = []
        for col in target_encode_cols:
            mean_target = df.groupby(col)[target_col].mean()
            df[f'{col}_encoded'] = df[col].map(mean_target)
            target_encoders[col] = mean_target
            target_encoded.append(df[[f'{col}_encoded']])
        
        target_encoded_df = pd.concat(target_encoded, axis=1)
        scaled_target_encoded = target_scaler.fit_transform(target_encoded_df)
        for i, col in enumerate(target_encode_cols):
            df[f'{col}_encoded'] = scaled_target_encoded[:, i]
            encoded_features.append(df[[f'{col}_encoded']])
    else:
        for col in ['Transaction', 'Furnishing', 'Ownership', 'Facing']:
            encoded = encoders[col].transform(df[[col]])
            encoded_df = pd.DataFrame(
                encoded, 
                columns=[f"{col}_{cat}" for cat in encoders[col].categories_[0]]
            )
            encoded_features.append(encoded_df)
        
        target_encoded = []
        for col in target_encode_cols:
            default_value = df[target_col].mean() if target_col in df else 0
            df[f'{col}_encoded'] = df[col].map(target_encoders.get(col, pd.Series())).fillna(default_value)
            target_encoded.append(df[[f'{col}_encoded']])
        
        target_encoded_df = pd.concat(target_encoded, axis=1)
        scaled_target_encoded = target_scaler.transform(target_encoded_df)
        for i, col in enumerate(target_encode_cols):
            df[f'{col}_encoded'] = scaled_target_encoded[:, i]
            encoded_features.append(df[[f'{col}_encoded']])
    
    feature_df = pd.concat([df[numerical_cols]] + encoded_features, axis=1)
    y = np.log1p(df[target_col].values) if target_col in df else None
    
    return feature_df, y, scaler, encoders, target_encoders, target_scaler


In [None]:
def create_edges(df, feature_df, k=15, sim_threshold=0.6, key_cols=['Society', 'Location', 'BHK']):
    from sklearn.neighbors import NearestNeighbors
    features = feature_df.values
    knn = NearestNeighbors(n_neighbors=min(k + 1, len(df)), metric='cosine', n_jobs=-1)
    knn.fit(features)
    distances, indices = knn.kneighbors(features)
    
    edge_index = []
    edge_weight = []
    
    num_nodes = len(df)
    for i in range(len(df)):
        for j_idx, dist in zip(indices[i][1:], distances[i][1:]):
            if j_idx >= num_nodes:
                continue
            sim = 1 - dist
            if sim > sim_threshold and any(df.iloc[i][col] == df.iloc[j_idx][col] for col in key_cols):
                edge_index.append([i, j_idx])
                edge_index.append([j_idx, i])
                edge_weight.append(sim)
                edge_weight.append(sim)
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float)
    return edge_index, edge_weight

# Preprocess test data
numerical_cols_input = ['Carpet Area', 'Super Area', 'Bathroom', 'Balcony', 'Current Floor', 
                        'Total Floors', 'BHK', 'Price', 'Car Parking', 'Price_x_SuperArea']
categorical_cols_input = ['Transaction', 'Furnishing', 'Overlooking', 'Ownership', 'Facing']
target_col = 'Amount'

test_features, test_y, _, _, _, _ = preprocess_data(
    test_df, numerical_cols_input, categorical_cols_input, target_col, 
    scaler=scaler, encoders=encoders, target_encoders=None, target_scaler=None, is_train=False
)

# Tạo đồ thị cho test (dùng phương pháp trước đó)
test_edge_index, test_edge_weight = create_edges(test_df, test_features, k=15, sim_threshold=0.6)
test_data = Data(
    x=torch.tensor(test_features.values, dtype=torch.float),
    edge_index=test_edge_index,
    edge_attr=test_edge_weight,
    y=torch.tensor(test_y, dtype=torch.float)
).to(device)

# Đánh giá mô hình
model.eval()
with torch.no_grad():
    pred = model(test_data)
    pred = np.expm1(pred.cpu().numpy())
    true = np.expm1(test_data.y.cpu().numpy())
    
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    r2 = r2_score(true, pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((true - pred) / true)) * 100
    
    print("\nGNN Test Metrics:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAPE: {mape:.2f}%")

# Save model
torch.save(model.state_dict(), 'gnn_model_with_neo4j_fixed_memory.pth')

# Clear memory
gc.collect()