In [1]:
# Mount Google Drive; the dataset CSVs are read from beneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# GPU Activation

In [2]:
# Shell out to nvidia-smi to show the GPU, driver and CUDA version of this runtime.
!nvidia-smi

Fri Oct 25 13:09:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Installing RAPIDS remaining 24.10.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com

        ***********************************************************************
        The pip install of RAPIDS is complete.
        
        Please do not run any further installation from the conda based installation methods, as they may cause issues!
        
        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files. 
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ****************************************************************

In [4]:
# Sanity check: import cuDF and display the installed version.
import cudf
cudf.__version__

'24.10.01'

In [5]:
# Sanity check: import cuML and display the installed version.
import cuml
cuml.__version__

'24.10.00'

In [6]:
# Sanity check: import cuGraph and display the installed version.
import cugraph
cugraph.__version__

'24.10.00'

In [7]:
# Sanity check: import cuSpatial and display the installed version.
import cuspatial
cuspatial.__version__

'24.10.00'

In [8]:
# Sanity check: import cuxfilter and display the installed version.
import cuxfilter
cuxfilter.__version__

'24.10.00'

# Random Forest

# Import

In [9]:
import math
import cudf
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from cuml.model_selection import GridSearchCV
from cuml.ensemble import RandomForestClassifier as cuRF

# Load

In [10]:
# Load dataset
# Both CSVs live in the same Drive folder; name it once so relocating the data
# is a single edit.
# NOTE: the folder name ends with a space ('ML dataset '). It is reproduced
# exactly from the original paths -- presumably that is the real Drive folder
# name; verify before "tidying" it.
DATA_DIR = '/content/drive/MyDrive/ML dataset '
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Split

In [11]:
def _is_genre_column(col):
    """Return True for string column labels that start with 'genre_'."""
    return isinstance(col, str) and col.startswith('genre_')


# Targets are the 'genre_*' columns; every other column is a feature.
# Each frame is filtered by its own column list, so train and test are split
# independently of one another.
train_genre_cols = [col for col in train.columns if _is_genre_column(col)]
test_genre_cols = [col for col in test.columns if _is_genre_column(col)]

X_train = train.drop(columns=train_genre_cols)
y_train = train[train_genre_cols]
X_test = test.drop(columns=test_genre_cols)
y_test = test[test_genre_cols]

In [12]:
# Print the (rows, columns) shape of each split, in the order
# X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(130629, 1515)
(130629, 19)
(32658, 1515)
(32658, 19)


In [13]:
# Convert each pandas split to a cuDF DataFrame.
# from_records is given the bare .values array, so the pandas column labels are
# not passed along to the cuDF frames.
X_train_cu, y_train_cu, X_test_cu, y_test_cu = (
    cudf.DataFrame.from_records(split_frame.values)
    for split_frame in (X_train, y_train, X_test, y_test)
)

# Training

In [17]:
import gc  # Import garbage collection

# NumPy copies of the feature matrices. They are the same for every label, so
# convert them once here rather than calling .to_numpy() on the cuDF frames in
# every loop iteration (previously both matrices were re-converted per label).
X_train_np = X_train_cu.to_numpy()
X_test_np = X_test_cu.to_numpy()

# One prediction array per label, appended in label-column order.
y_pred_list = []
num_labels = y_train.shape[1]

# Fit one random forest per label column, one model at a time.
# The former outer "batch" loop visited exactly the same indices in the same
# order and never trained models concurrently, so it is flattened into a
# single loop over the labels.
for j in range(num_labels):
    # Create and fit the model for the current label
    rf_model = cuRF(n_estimators=100, max_depth=10, random_state=42)
    rf_model.fit(X_train_np, y_train_cu.iloc[:, j].to_numpy())

    # Predict on the test set for the current label
    y_pred = rf_model.predict(X_test_np)
    y_pred_list.append(y_pred)  # Store predictions

    # Drop the fitted model before building the next one
    del rf_model  # Remove the model from memory
    gc.collect()  # Collect garbage

# The NumPy feature copies are only needed for fitting/predicting above.
del X_train_np, X_test_np
gc.collect()

In [25]:

# y_pred_list holds one prediction array per label; stack them into a 2D array
# and transpose it so that labels run along the columns, like y_test.
per_label_preds = np.array(y_pred_list)
y_pred_final = per_label_preds.T

# Ground-truth labels as a NumPy array, the form passed to classification_report
y_test_np = y_test.to_numpy()

# Optional cuDF copy of the predictions (kept in case a cuDF frame is wanted)
y_pred_final_cu = cudf.DataFrame.from_records(y_pred_final)

In [28]:
print('Random Forest:')
print('-----------------------------------------------------')
# Report labels: the genre column names with the leading 'genre_' prefix removed.
# The prefix is sliced off rather than removed with str.replace, because
# replace would also delete any later 'genre_' occurrence inside a column name.
GENRE_PREFIX = 'genre_'
genre_columns = [
    col[len(GENRE_PREFIX):]
    for col in train.columns
    if isinstance(col, str) and col.startswith(GENRE_PREFIX)
]
print(classification_report(y_test_np, y_pred_final, target_names=genre_columns, zero_division=0))

Random Forest:
-----------------------------------------------------
                 precision    recall  f1-score   support

         Action       0.00      0.00      0.00      2988
      Adventure       0.00      0.00      0.00      1681
      Animation       0.00      0.00      0.00      2603
         Comedy       0.00      0.00      0.00      8162
          Crime       0.00      0.00      0.00      2220
    Documentary       0.00      0.00      0.00      5998
          Drama       0.66      0.01      0.02     12017
         Family       0.00      0.00      0.00      1683
        Fantasy       0.00      0.00      0.00      1434
        History       0.00      0.00      0.00      1002
         Horror       0.00      0.00      0.00      2961
          Music       0.00      0.00      0.00      1993
        Mystery       0.00      0.00      0.00      1279
        Romance       0.00      0.00      0.00      3280
Science Fiction       0.00      0.00      0.00      1360
       TV Movie   