In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-11ac0n65
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-11ac0n65
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=430702b1ad62622e496390bc23d1fabe6617756155d7cdd5a5f1945e928ef294
  Stored in directory: /tmp/pip-ephem-wheel-cache-s5tf1j7m/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin
!nvidia-smi

created output directory at /content/src
Out bin /content/result.out
Sat Jan  7 14:54:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+------------------

In [4]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdlib>
#include <ctime>
#include <iostream>
#include <stdio.h>
#include <vector>


using namespace std;

// CPU clock() readings taken immediately before and after the host-side
// reference loop in main().
clock_t c_start;
clock_t c_end;

// Number of sample points (32 Mi).
int n = 32 * 1024 * 1024;

/**
 * Counts, per thread, how many sample points lie strictly inside the unit
 * circle (x*x + y*y < 1).
 *
 * Launch layout: 1D grid of 1D blocks. Thread `tid` processes the contiguous
 * index range [tid * 128, tid * 128 + 128) and stores its hit count in
 * dev_threads_num[tid]. Threads whose chunk does not fit completely inside
 * [0, n) do nothing, so the grid may be larger than n / 128 threads.
 * No shared memory is used.
 *
 * @param dev_randX        device-accessible array of at least n x-coordinates
 * @param dev_randY        device-accessible array of at least n y-coordinates
 * @param dev_threads_num  device-accessible array of at least n / 128 ints;
 *                         slot `tid` receives the hit count of thread `tid`
 * @param n                number of points; a trailing n % 128 points (an
 *                         incomplete chunk) are not counted
 */
__global__ void count_pi(float* dev_randX, float* dev_randY, int* dev_threads_num, int n) {
	const int points_per_thread = 128;
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	// Bounds guard: only threads that own a complete chunk may touch memory.
	// Without it, any surplus thread reads past the coordinate arrays and
	// writes past the n / 128 result slots.
	if (tid >= n / points_per_thread) {
		return;
	}

	// NOTE: neighbouring threads read addresses 128 floats apart, so the loads
	// of a warp are not contiguous.
	const int first = tid * points_per_thread;
	const int last = first + points_per_thread;

	int cont = 0;
	for (int i = first; i < last; i++) {
		const float x = dev_randX[i];
		const float y = dev_randY[i];
		if (x * x + y * y < 1.0f) {
			cont++;
		}
	}
	dev_threads_num[tid] = cont;
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/**
 * Estimates pi by Monte Carlo sampling of n points in the unit square, once
 * on the CPU and once on the GPU (count_pi kernel), and prints both estimates
 * together with their run times.
 *
 * The CPU time covers only the counting loop. The GPU time covers device
 * allocation, both host-to-device copies, the kernel, the device-to-host copy
 * and the final host-side summation of the per-thread counts.
 *
 * @return 0 on success, EXIT_FAILURE if n does not fit the launch layout;
 *         the process exits with EXIT_FAILURE on any CUDA error.
 */
int main() {
	const int points_per_thread = 128;  // chunk size hard-coded in count_pi
	const int threadsPerBlock = 512;

	// Each thread consumes exactly 128 points and the grid is sized by integer
	// division, so any remainder would silently drop points from the estimate.
	if (n <= 0 || n % (points_per_thread * threadsPerBlock) != 0) {
		fprintf(stderr, "n must be a positive multiple of %d\n", points_per_thread * threadsPerBlock);
		return EXIT_FAILURE;
	}
	const int thread_count = n / points_per_thread;
	const int block_num = thread_count / threadsPerBlock;

	// Generate the sample points on the host; both runs use the same data.
	vector<float> randX(n);
	vector<float> randY(n);

	srand((unsigned)time(NULL));
	for (int i = 0; i < n; i++) {
		randX[i] = float(rand()) / RAND_MAX;
		randY[i] = float(rand()) / RAND_MAX;
	}

	// ---- CPU reference ----
	c_start = clock();
	int c_count = 0;
	for (int i = 0; i < n; i++) {
		if (randX[i] * randX[i] + randY[i] * randY[i] < 1.0f) {
			c_count++;
		}
	}
	c_end = clock();
	float t_cpu = (float)(c_end - c_start) / CLOCKS_PER_SEC;
	// Scale in double: a float cannot represent counts above 2^24 exactly.
	double c_num = 4.0 * c_count / n;
	cout << "CPU Time" << endl;
	cout << "PI_value = " << c_num << endl;
	cout << "time= " << t_cpu * 1000 << " ms" << endl;

	// ---- GPU run ----
	cudaEvent_t start, stop;
	check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
	check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

	const size_t size = static_cast<size_t>(n) * sizeof(float);
	const size_t counts_size = static_cast<size_t>(thread_count) * sizeof(int);
	float* dev_randX = NULL;
	float* dev_randY = NULL;
	int* dev_threads_num = NULL;
	check_cuda(cudaMalloc((void**)&dev_randX, size), "cudaMalloc(dev_randX)");
	check_cuda(cudaMalloc((void**)&dev_randY, size), "cudaMalloc(dev_randY)");
	check_cuda(cudaMalloc((void**)&dev_threads_num, counts_size), "cudaMalloc(dev_threads_num)");

	check_cuda(cudaMemcpy(dev_randX, randX.data(), size, cudaMemcpyHostToDevice), "copy of randX to device");
	check_cuda(cudaMemcpy(dev_randY, randY.data(), size, cudaMemcpyHostToDevice), "copy of randY to device");

	count_pi<<<block_num, threadsPerBlock>>>(dev_randX, dev_randY, dev_threads_num, n);
	// A launch returns no status itself; configuration errors show up here.
	check_cuda(cudaGetLastError(), "count_pi launch");

	// Blocking copy: it waits for the kernel, so execution errors surface here.
	vector<int> threads_num(thread_count);
	check_cuda(cudaMemcpy(threads_num.data(), dev_threads_num, counts_size, cudaMemcpyDeviceToHost),
	           "copy of per-thread counts to host");

	int g_count = 0;
	for (int i = 0; i < thread_count; i++) {
		g_count += threads_num[i];
	}

	check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	float t_gpu;
	check_cuda(cudaEventElapsedTime(&t_gpu, start, stop), "cudaEventElapsedTime");
	check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
	check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

	// Release device memory; the original never freed these buffers.
	check_cuda(cudaFree(dev_randX), "cudaFree(dev_randX)");
	check_cuda(cudaFree(dev_randY), "cudaFree(dev_randY)");
	check_cuda(cudaFree(dev_threads_num), "cudaFree(dev_threads_num)");

	double g_num = 4.0 * g_count / n;
	cout << "GPU Time" << endl;
	cout << "PI_value = " << g_num << endl;
	cout << "time = " << t_gpu << " ms" << endl;

	return 0;
}

CPU Time
PI_value = 3.14158
time= 458.325 ms
GPU Time
PI_value = 3.14158
time = 69.2399 ms

