In [1]:
!nvidia-smi



Fri Jul 11 13:18:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 596, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 596 (delta 128), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (596/596), 195.77 KiB | 7.53 MiB/s, done.
Resolving deltas: 100% (302/302), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.13 environment at: /usr
Resolved 173 packages in 1.76s
Downloading cucim-cu12 (5.6MiB)
Downloading cuml-cu12 (9.4MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading datashader (17.5MiB)
Downloading dask (1.3MiB)
Downloading shapely (2.4MiB)
Downloading bokeh (6.6MiB)
Downloading raft-dask-cu12 (274.9MiB)
Downloading cudf-cu12 (1.7MiB)
Downloading cugraph-cu12 (3.0MiB)
Downloading rmm-cu12 (1.5MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading libcudf-cu12 

Q1. Data Preprocessing with cuDF vs panda
a Load the dataset using both pandas and cudf. Record the load times.
b Perform the following tasks with both libraries:
• Display .info(), .describe()
• Drop any rows with nulls (if any)
• Convert the price column to log scale
• Filter records for the last 5 years (based on date_of_transfer)
c Compare the execution time for each task in pandas vs cudf.

In [3]:
import pandas as pd
import cudf
import time


In [4]:
file_path = '/content/drive/MyDrive/price_paid_records.csv'


In [5]:
# Define column names
columns = [
    "Transaction unique identifier", "Price", "Date of Transfer", "Property Type",
    "Old/New", "Duration", "Town/City", "District", "County",
    "PPDCategory Type", "Record Status - monthly file only"
]

In [7]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [11]:
file_path = '/content/drive/MyDrive/price_paid_records.csv'


In [12]:
import os
for f in os.listdir('/content'):
    print(f)


.config
rapidsai-csp-utils
drive
sample_data


In [13]:
# 1️⃣ Load CSV
start = time.time()
pdf = pd.read_csv(file_path, names=columns, header=0)
pandas_load_time = time.time() - start

start = time.time()
gdf = cudf.read_csv(file_path, names=columns, header=0)
cudf_load_time = time.time() - start

In [14]:
print("Load Time:")
print(f"Pandas: {pandas_load_time:.3f}s")
print(f"cuDF:   {cudf_load_time:.3f}s\n")

Load Time:
Pandas: 72.974s
cuDF:   13.041s



In [15]:
# info() and describe()
start = time.time()
_ = pdf.info()
_ = pdf.describe()
pandas_info_time = time.time() - start

start = time.time()
_ = gdf.info()
_ = gdf.describe()
cudf_info_time = time.time() - start

print("📋 Info & Describe Time:")
print(f"Pandas: {pandas_info_time:.3f}s")
print(f"cuDF:   {cudf_info_time:.3f}s\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22489348 entries, 0 to 22489347
Data columns (total 11 columns):
 #   Column                             Dtype 
---  ------                             ----- 
 0   Transaction unique identifier      object
 1   Price                              int64 
 2   Date of Transfer                   object
 3   Property Type                      object
 4   Old/New                            object
 5   Duration                           object
 6   Town/City                          object
 7   District                           object
 8   County                             object
 9   PPDCategory Type                   object
 10  Record Status - monthly file only  object
dtypes: int64(1), object(10)
memory usage: 1.8+ GB
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 22489348 entries, 0 to 22489347
Data columns (total 11 columns):
 #   Column                             Dtype
---  ------                             -----
 0   Transacti

In [16]:
# Drop nulls
start = time.time()
pdf_clean = pdf.dropna()
pandas_dropna_time = time.time() - start

start = time.time()
gdf_clean = gdf.dropna()
cudf_dropna_time = time.time() - start

print("🧹 Drop NA Time:")
print(f"Pandas: {pandas_dropna_time:.3f}s")
print(f"cuDF:   {cudf_dropna_time:.3f}s\n")

🧹 Drop NA Time:
Pandas: 13.572s
cuDF:   0.066s



In [18]:
import numpy as np
import cupy as cp
#Log transform
start = time.time()
pdf_clean["log_price"] = np.log1p(pdf_clean["Price"])
pandas_log_time = time.time() - start

start = time.time()
gdf_clean["log_price"] = cp.log1p(gdf_clean["Price"])
cudf_log_time = time.time() - start

print("Log Transform Time:")
print(f"Pandas: {pandas_log_time:.3f}s")
print(f"cuDF:   {cudf_log_time:.3f}s\n")

Log Transform Time:
Pandas: 0.436s
cuDF:   0.692s



In [20]:
# ✅ Set the cutoff date (using pandas datetime works fine)
cutoff_date = pd.to_datetime("2019-01-01")

# 🕒 Pandas Filtering
start = time.time()
pdf_clean["Date of Transfer"] = pd.to_datetime(pdf_clean["Date of Transfer"], errors="coerce")
pdf_recent = pdf_clean[pdf_clean["Date of Transfer"] >= cutoff_date]
pandas_filter_time = time.time() - start

# 🕒 cuDF Filtering
start = time.time()
gdf_clean["Date of Transfer"] = cudf.to_datetime(gdf_clean["Date of Transfer"])  # removed errors='coerce'
gdf_recent = gdf_clean[gdf_clean["Date of Transfer"] >= cutoff_date]
cudf_filter_time = time.time() - start

# 🖨️ Output timings
print("📅 Date Filter Time:")
print(f"Pandas: {pandas_filter_time:.3f}s")
print(f"cuDF:   {cudf_filter_time:.3f}s\n")


📅 Date Filter Time:
Pandas: 1.064s
cuDF:   0.303s



In [21]:
print("📊 Final Time Comparison Summary (seconds):")
print(f"{'Task':30} {'Pandas':>10} {'cuDF':>10}")
print(f"{'-'*50}")
print(f"{'Load CSV':30} {pandas_load_time:10.3f} {cudf_load_time:10.3f}")
print(f"{'Info & Describe':30} {pandas_info_time:10.3f} {cudf_info_time:10.3f}")
print(f"{'Drop NA':30} {pandas_dropna_time:10.3f} {cudf_dropna_time:10.3f}")
print(f"{'Log Transform':30} {pandas_log_time:10.3f} {cudf_log_time:10.3f}")
print(f"{'Date Filter (Last 5 Years)':30} {pandas_filter_time:10.3f} {cudf_filter_time:10.3f}")

📊 Final Time Comparison Summary (seconds):
Task                               Pandas       cuDF
--------------------------------------------------
Load CSV                           72.974     13.041
Info & Describe                     1.976      0.609
Drop NA                            13.572      0.066
Log Transform                       0.436      0.692
Date Filter (Last 5 Years)          1.064      0.303


Q2: Predicting Price Category with cuML vs Scikit-learn
a) Preprocess data to create features for ML:
• Convert categorical fields (e.g., property_type, town, county) using label
encoding
• Create a target variable:
• price_category = 1 if price > median_price else 0 (binary classification)
b) Train a classification model (e.g., RandomForestClassifier) using:
• scikit-learn
• cuML
c) Compare:
• Model training time
• Accuracy or F1-score
• Prediction time

In [22]:
import pandas as pd
import cudf
import numpy as np
import cupy as cp
from cuml.preprocessing import LabelEncoder as cuLabelEncoder
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.ensemble import RandomForestClassifier as skRF
from sklearn.preprocessing import LabelEncoder as skLabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import time


In [24]:
columns = [
    "Transaction unique identifier", "Price", "Date of Transfer", "Property Type",
    "Old/New", "Duration", "Town/City", "District", "County",
    "PPDCategory Type", "Record Status - monthly file only"
]

file_path = '/content/drive/MyDrive/price_paid_records.csv'  # Adjust path

# Load 1000 rows
pdf = pd.read_csv(file_path, names=columns, header=0, nrows=1000)
gdf = cudf.read_csv(file_path, names=columns, header=0, nrows=1000)


In [25]:
# Choose same features
features = ["Price", "Property Type", "Town/City", "County"]

# Pandas version
pdf = pdf[features].dropna()
median_price = pdf["Price"].median()
pdf["price_category"] = (pdf["Price"] > median_price).astype(int)

# Label Encoding with sklearn
for col in ["Property Type", "Town/City", "County"]:
    le = skLabelEncoder()
    pdf[col] = le.fit_transform(pdf[col])

X_pd = pdf.drop("price_category", axis=1)
y_pd = pdf["price_category"]


In [26]:
# cuDF version
gdf = gdf[features].dropna()
median_price_gpu = gdf["Price"].median()
gdf["price_category"] = (gdf["Price"] > median_price_gpu).astype("int32")

# Label Encoding with cuML
for col in ["Property Type", "Town/City", "County"]:
    le = cuLabelEncoder()
    gdf[col] = le.fit_transform(gdf[col])

X_gdf = gdf.drop("price_category", axis=1)
y_gdf = gdf["price_category"]


In [27]:
X_train_pd, X_test_pd, y_train_pd, y_test_pd = train_test_split(X_pd, y_pd, test_size=0.2, random_state=42)

start = time.time()
sk_model = skRF(n_estimators=100, random_state=42)
sk_model.fit(X_train_pd, y_train_pd)
sk_train_time = time.time() - start

start = time.time()
y_pred_pd = sk_model.predict(X_test_pd)
sk_pred_time = time.time() - start

sk_acc = accuracy_score(y_test_pd, y_pred_pd)
sk_f1 = f1_score(y_test_pd, y_pred_pd)


In [28]:
X_train_gdf, X_test_gdf, y_train_gdf, y_test_gdf = train_test_split(X_gdf, y_gdf, test_size=0.2, random_state=42)

start = time.time()
cu_model = cuRF(n_estimators=100, random_state=42)
cu_model.fit(X_train_gdf, y_train_gdf)
cu_train_time = time.time() - start

start = time.time()
y_pred_gdf = cu_model.predict(X_test_gdf)
cu_pred_time = time.time() - start

# Convert predictions to CPU for scoring
cu_acc = accuracy_score(y_test_gdf.to_pandas(), y_pred_gdf.to_pandas())
cu_f1 = f1_score(y_test_gdf.to_pandas(), y_pred_gdf.to_pandas())


  return init_func(self, *args, **kwargs)


In [29]:
print("🔍 Model Comparison (1000 Samples)\n")
print(f"{'Metric':25} {'scikit-learn':>15} {'cuML (GPU)':>15}")
print("-" * 60)
print(f"{'Train Time (s)':25} {sk_train_time:15.4f} {cu_train_time:15.4f}")
print(f"{'Prediction Time (s)':25} {sk_pred_time:15.4f} {cu_pred_time:15.4f}")
print(f"{'Accuracy':25} {sk_acc:15.4f} {cu_acc:15.4f}")
print(f"{'F1 Score':25} {sk_f1:15.4f} {cu_f1:15.4f}")


🔍 Model Comparison (1000 Samples)

Metric                       scikit-learn      cuML (GPU)
------------------------------------------------------------
Train Time (s)                     0.3793          4.7260
Prediction Time (s)                0.0318          0.5506
Accuracy                           1.0000          0.8000
F1 Score                           1.0000          0.7959


Q3: Graph Analysis with cuGraph vs NetworkX
a) Construct a graph from the dataset:
• Nodes: Unique postcodes
• Edges: Connect postcodes from the same town or within the
same county
b) For both cuGraph and NetworkX, compute:
• Degree centrality
• Connected components
• PageRank (if applicable)
c) Time and compare results. Comment on:
• Performance differences
• Any challenges in using cuGraph (format conversion, graph types)

In [30]:
!pip install cugraph -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for cugraph (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for cugraph[0m[31m
[0m[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (cugraph)[0m[31m
[0m[?25h

In [32]:
cols = [
    "Transaction unique identifier", "Price", "Date of Transfer", "Postcode", "Property Type",
    "Old/New", "Duration", "Town/City", "District", "County",
    "PPDCategory Type", "Record Status - monthly file only"
]


In [33]:
cols = [
    "Transaction unique identifier",  # 0
    "Price",                          # 1
    "Date of Transfer",               # 2
    "Postcode",                       # 3
    "Property Type",                  # 4
    "Old/New",                        # 5
    "Duration",                       # 6
    "PAON",                           # 7
    "SAON",                           # 8
    "Street",                         # 9
    "Locality",                       # 10
    # Optional: add "Town/City", "District", "County", etc., if it's the *enhanced* version of the dataset.
]


In [34]:
cols = [
    "Transaction unique identifier", "Price", "Date of Transfer", "Postcode", "Property Type",
    "Old/New", "Duration", "PAON", "SAON", "Street", "Locality"
]

# Load first 1000 rows
pdf = pd.read_csv(file_path, names=cols, header=0, nrows=1000)
gdf = cudf.read_csv(file_path, names=cols, header=0, nrows=1000)


In [36]:
import pandas as pd
import cudf
import networkx as nx
import cugraph
import time




In [38]:
print(pdf.columns.tolist())


['Transaction unique identifier', 'Price', 'Date of Transfer', 'Postcode', 'Property Type', 'Old/New', 'Duration', 'PAON', 'SAON', 'Street', 'Locality']


In [39]:
from itertools import combinations

def build_edges(df, group_col):
    edges = set()
    for _, group in df.groupby(group_col):
        postcodes = group["Postcode"].dropna().unique()
        for src, dst in combinations(postcodes, 2):
            edges.add((src, dst))
    return list(edges)

# ✅ Use Locality and Street instead
edges_locality = build_edges(pdf, "Locality")
edges_street = build_edges(pdf, "Street")

# Combine and deduplicate
edges_all = list(set(edges_locality + edges_street))
edges_df = pd.DataFrame(edges_all, columns=["src", "dst"])

print(f"✅ Total edges constructed: {len(edges_df)}")


✅ Total edges constructed: 6


In [40]:
import networkx as nx
import time

# Build NetworkX graph
G_nx = nx.Graph()
G_nx.add_edges_from(edges_df.values)

# Run NetworkX algorithms
start = time.time()

nx_deg = nx.degree_centrality(G_nx)
nx_cc = list(nx.connected_components(G_nx))
nx_pr = nx.pagerank(G_nx)

nx_time = time.time() - start
print(f"✅ NetworkX analysis done in {nx_time:.4f} seconds")


✅ NetworkX analysis done in 0.0250 seconds


In [41]:
import cudf
import cugraph

# Convert edge list to cuDF
edges_cudf = cudf.DataFrame(edges_df)

# cuGraph requires numeric node IDs → encode postcodes as integers
unique_nodes = list(set(edges_df['src']) | set(edges_df['dst']))
node_id_map = {val: idx for idx, val in enumerate(unique_nodes)}

# Apply encoding
edges_cudf['src'] = edges_cudf['src'].map(node_id_map)
edges_cudf['dst'] = edges_cudf['dst'].map(node_id_map)

# Create graph and run cuGraph algorithms
start = time.time()

G_cu = cugraph.Graph()
G_cu.from_cudf_edgelist(edges_cudf, source='src', destination='dst', renumber=False)

cu_deg = cugraph.degree_centrality(G_cu)
cu_cc = cugraph.connected_components(G_cu)
cu_pr = cugraph.pagerank(G_cu)

cu_time = time.time() - start
print(f"✅ cuGraph analysis done in {cu_time:.4f} seconds")


✅ cuGraph analysis done in 1.1885 seconds




In [43]:
print("\n📊 Graph Analysis Comparison")
print(f"{'Metric':30} {'NetworkX':>12} {'cuGraph':>12}")
print("-" * 58)
print(f"{'Execution Time (s)':30} {nx_time:12.4f} {cu_time:12.4f}")
print(f"{'# Nodes':30} {G_nx.number_of_nodes():12} {G_cu.number_of_vertices():12}")
print(f"{'# Edges':30} {G_nx.number_of_edges():12} {G_cu.number_of_edges():12}")
print(f"{'# Connected Components':30} {len(nx_cc):12} {cu_cc['labels'].nunique():12}")



📊 Graph Analysis Comparison
Metric                             NetworkX      cuGraph
----------------------------------------------------------
Execution Time (s)                   0.0250       1.1885
# Nodes                                   4            4
# Edges                                   6            2
# Connected Components                    1            1
