In [1]:
# Mount Google Drive at /content/drive; the dataset CSVs read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# GPU Activation

In [2]:
# Query the attached NVIDIA GPU (model, driver/CUDA version, memory usage).
!nvidia-smi

Fri Oct 25 13:52:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Installing RAPIDS remaining 24.10.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com

        ***********************************************************************
        The pip install of RAPIDS is complete.
        
        Please do not run any further installation from the conda based installation methods, as they may cause issues!
        
        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files. 
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ****************************************************************

In [4]:
# Import cuDF and display its installed version (sanity check after the RAPIDS install).
import cudf
cudf.__version__

'24.10.01'

In [5]:
# Import cuML and display its installed version (sanity check after the RAPIDS install).
import cuml
cuml.__version__

'24.10.00'

In [6]:
# Import cuGraph and display its installed version (sanity check after the RAPIDS install).
import cugraph
cugraph.__version__

'24.10.00'

In [7]:
# Import cuSpatial and display its installed version (sanity check after the RAPIDS install).
import cuspatial
cuspatial.__version__

'24.10.00'

In [8]:
# Import cuxfilter and display its installed version (sanity check after the RAPIDS install).
import cuxfilter
cuxfilter.__version__

'24.10.00'

# KNN

# Import

In [9]:
import pandas as pd
import numpy as np
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
from cuml.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Load

In [10]:
# Read the train and test CSVs from the mounted Drive folder into pandas frames.
# The folder name in these paths contains a space before the slash; the paths
# are kept exactly as written.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/ML dataset /train.csv',
        '/content/drive/MyDrive/ML dataset /test.csv',
    )
)

# Split

In [11]:
def _is_genre_column(col):
    """Return True for string column labels that start with 'genre_'."""
    return isinstance(col, str) and col.startswith('genre_')


# Columns whose name starts with 'genre_' become the targets (y); every other
# column stays in the feature matrix (X). Each frame is split by its own columns.
train_genre_cols = [col for col in train.columns if _is_genre_column(col)]
test_genre_cols = [col for col in test.columns if _is_genre_column(col)]

X_train = train.drop(columns=train_genre_cols)
y_train = train[train_genre_cols]
X_test = test.drop(columns=test_genre_cols)
y_test = test[test_genre_cols]

In [12]:
# Print (rows, columns) for each split, in the order X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(130629, 1515)
(130629, 19)
(32658, 1515)
(32658, 19)


# Training

In [13]:
from sklearn.utils import shuffle

# Fixed seed so the subsample, and therefore the hyperparameters selected from
# it, is the same on every Restart & Run All.
RANDOM_STATE = 42

# Cap on the number of training rows used for the grid search.
subsample_size = 50000
# shuffle() raises if n_samples exceeds the number of rows, so never ask for
# more rows than the training set has.
n_search_rows = min(subsample_size, len(X_train))

# Shuffle features and labels together and keep the first n_search_rows rows.
X_train_sub, y_train_sub = shuffle(
    X_train,
    y_train,
    n_samples=n_search_rows,
    random_state=RANDOM_STATE,
)

# KNN estimator with default settings; n_neighbors is overridden by the search.
knn = cuKNeighbors()

# Candidate neighbour counts: 1 through 8 inclusive.
param_grid = {
    'n_neighbors': np.arange(1, 9),
}

# 5-fold cross-validated grid search on the subsample (verbose=3 logs each fit).
grid_search = GridSearchCV(knn, param_grid, cv=5, verbose=3)
grid_search.fit(X_train_sub, y_train_sub)

# Print the best parameters from the grid search
print("Best parameters found: ", grid_search.best_params_)

[I] [13:54:25.608707] Unused keyword parameter: n_jobs during cuML estimator initialization
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[I] [13:54:25.611828] Unused keyword parameter: n_jobs during cuML estimator initialization
[CV 1/5] END .....................n_neighbors=1;, score=0.831 total time=   4.3s
[I] [13:54:29.881278] Unused keyword parameter: n_jobs during cuML estimator initialization
[CV 2/5] END .....................n_neighbors=1;, score=0.830 total time=   3.1s
[I] [13:54:32.963458] Unused keyword parameter: n_jobs during cuML estimator initialization
[CV 3/5] END .....................n_neighbors=1;, score=0.835 total time=   2.9s
[I] [13:54:35.844288] Unused keyword parameter: n_jobs during cuML estimator initialization
[CV 4/5] END .....................n_neighbors=1;, score=0.840 total time=   2.3s
[I] [13:54:38.191491] Unused keyword parameter: n_jobs during cuML estimator initialization
[CV 5/5] END .....................n_neighbors=1;, score=0.837 to

In [14]:
# Build the final classifier with the neighbour count selected by the grid
# search, fit it on the complete training split, then predict the test split.
best_k = grid_search.best_params_['n_neighbors']
best_knn = cuKNeighbors(n_neighbors=best_k)
best_knn.fit(X_train, y_train)
y_pred = best_knn.predict(X_test)


In [15]:
print('KNN:')
print('-----------------------------------------------------')
# Build display names for the report by stripping the leading 'genre_' marker
# from each genre column. Slicing removes only the prefix that startswith()
# matched; str.replace would also delete any later 'genre_' inside a name.
label_prefix = 'genre_'
genre_columns = [
    col[len(label_prefix):]
    for col in train.columns
    if isinstance(col, str) and col.startswith(label_prefix)
]
# zero_division=0 reports 0 (without warnings) for metrics whose denominator is zero.
print(classification_report(y_test, y_pred, target_names=genre_columns, zero_division=0))

KNN:
-----------------------------------------------------
                 precision    recall  f1-score   support

         Action       0.41      0.01      0.02      2988
      Adventure       0.73      0.01      0.03      1681
      Animation       0.25      0.00      0.00      2603
         Comedy       0.33      0.05      0.08      8162
          Crime       0.00      0.00      0.00      2220
    Documentary       0.27      0.02      0.04      5998
          Drama       0.40      0.10      0.16     12017
         Family       0.29      0.00      0.00      1683
        Fantasy       0.40      0.00      0.00      1434
        History       0.00      0.00      0.00      1002
         Horror       0.29      0.00      0.01      2961
          Music       0.00      0.00      0.00      1993
        Mystery       0.00      0.00      0.00      1279
        Romance       0.10      0.00      0.00      3280
Science Fiction       0.73      0.01      0.01      1360
       TV Movie       0.00  