In [1]:
!nvidia-smi


Sun Jan 25 16:30:03 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   68C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [28]:
%%writefile task3_cpu.cpp

#include <iostream>
#include <vector>
#include <chrono>

#define N 100000000  // 100 million elements

// Doubles every element of `a` in place on the CPU.
//
// The loop is driven by the vector itself rather than by a compile-time
// element count, so a vector of any length (including an empty one) is
// handled without reading or writing out of bounds.
void cpuMul(std::vector<int>& a) {
    for (int& value : a) {
        value *= 2;
    }
}

// CPU baseline: fills a vector of N ones, doubles it with cpuMul, and prints
// the wall-clock time of that call together with the first and last element.
int main() {
    using Clock = std::chrono::high_resolution_clock;

    std::vector<int> a(N, 1);

    // Only the cpuMul call is timed; allocation and initialisation are excluded.
    const auto t_begin = Clock::now();
    cpuMul(a);
    const auto t_end = Clock::now();

    const double elapsed_ms =
        std::chrono::duration<double, std::milli>(t_end - t_begin).count();

    std::cout << "CPU time (ms): " << elapsed_ms << std::endl;
    std::cout << "CPU a[0]: " << a.front() << " a[N-1]: " << a[N - 1] << std::endl;
    return 0;
}



Writing task3_cpu.cpp


In [29]:
%%writefile task3_gpu.cu

#include <iostream>
#include <vector>
#include <chrono>
#include <cuda_runtime.h>

// Number of int elements in the array processed by this program.
#define N 100000000
// Threads per block used when launching the kernel.
#define THREADS 256

// Checks the result of a CUDA runtime call. On any status other than
// cudaSuccess it prints the CUDA error string to stderr and terminates the
// process with exit code 1.
//
// The argument is evaluated exactly once and stored in a local, so a failing
// call is not executed a second time when its error string is looked up.
// The do { ... } while (0) wrapper makes the macro behave as one statement,
// which keeps it safe inside unbraced if/else bodies.
#define CUDA_CHECK(call) \
    do { \
        const cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            std::cerr << "CUDA error: " << cudaGetErrorString(cuda_check_status_) << std::endl; \
            exit(1); \
        } \
    } while (0)

// Kernel: each thread doubles at most one element of `a` in place.
// The element index is the thread's flattened 1-D position in the launch;
// threads whose index is n or larger do nothing.
__global__ void mulKernel(int* a, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;  // surplus thread in the last block
    }
    a[tid] = a[tid] * 2;
}

// GPU benchmark: copies N ones to the device, doubles them with mulKernel,
// and prints the kernel time measured with CUDA events together with the
// first and last element of the result.
int main() {
    std::vector<int> h_a(N, 1);
    const size_t bytes = N * sizeof(int);
    int* d_a = nullptr;

    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Round up so the last, partially filled block is still launched.
    const int blocks = (N + THREADS - 1) / THREADS;

    CUDA_CHECK(cudaEventRecord(start));
    mulKernel<<<blocks, THREADS>>>(d_a, N);
    // A kernel launch returns no status; a bad launch configuration is only
    // visible through cudaGetLastError. The status is stored in a local first
    // so that it is read exactly once before being checked.
    const cudaError_t launchErr = cudaGetLastError();
    CUDA_CHECK(launchErr);
    CUDA_CHECK(cudaEventRecord(stop));

    // Waiting on `stop` blocks until the kernel has finished, so no separate
    // device-wide synchronisation is placed inside the timed window. This call
    // may also return errors raised by the preceding asynchronous launch.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float gpu_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start, stop));

    CUDA_CHECK(cudaMemcpy(h_a.data(), d_a, bytes, cudaMemcpyDeviceToHost));

    std::cout << "GPU time (ms): " << gpu_time << std::endl;
    std::cout << "GPU a[0]: " << h_a[0] << " a[N-1]: " << h_a[N-1] << std::endl;

    // Release every CUDA resource created above.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_a));
    return 0;
}


Writing task3_gpu.cu


In [43]:
%%writefile task3_hybrid.cu
#include <iostream>
#include <vector>
#include <chrono>
#include <thread>
#include <cuda_runtime.h>

// Total number of int elements; main splits them between the CPU and the GPU.
#define N 100000000
// Threads per block used when launching the kernel.
#define THREADS 256

// Checks the result of a CUDA runtime call. On any status other than
// cudaSuccess it prints the CUDA error string to stderr and terminates the
// process with exit code 1.
//
// The argument is evaluated exactly once and stored in a local, so a failing
// call is not executed a second time when its error string is looked up.
// The do { ... } while (0) wrapper makes the macro behave as one statement,
// which keeps it safe inside unbraced if/else bodies.
#define CUDA_CHECK(call) \
    do { \
        const cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            std::cerr << "CUDA error: " << cudaGetErrorString(cuda_check_status_) << std::endl; \
            exit(1); \
        } \
    } while (0)

// Kernel: each thread doubles at most one element of `a` in place.
// A thread's flattened 1-D launch position is shifted by `offset` to obtain
// the array index it works on; indices of n or larger are skipped.
__global__ void mulKernel(int* a, int offset, int n) {
    const int local = blockIdx.x * blockDim.x + threadIdx.x;
    const int target = local + offset;
    if (target >= n) {
        return;  // surplus thread past the end of the array
    }
    a[target] = a[target] * 2;
}

// Doubles a[start] .. a[end - 1] in place on the CPU.
// Nothing is touched when start >= end.
void cpuMul(int* a, int start, int end) {
    int pos = start;
    while (pos < end) {
        a[pos] = a[pos] * 2;
        ++pos;
    }
}

// Hybrid benchmark: the lower part of the array [0, half) is doubled by a CPU
// worker thread while the upper part [half, N) is doubled by the GPU. Prints
// the wall-clock time of the combined work (including the device-to-host copy
// of the GPU part) and the first and last element of the result.
int main() {
    std::vector<int> a(N, 1);
    int* d_a = nullptr;

    CUDA_CHECK(cudaMalloc(&d_a, N * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_a, a.data(), N * sizeof(int), cudaMemcpyHostToDevice));

    const int half = N / 2;
    // The GPU takes everything the CPU does not, so an odd N leaves no element
    // unprocessed or uncopied.
    const int gpuCount = N - half;

    auto start = std::chrono::high_resolution_clock::now();

    // CPU part runs in a separate thread.
    std::thread cpuThread(cpuMul, a.data(), 0, half);

    // GPU part is driven from the main thread.
    const int blocks = (gpuCount + THREADS - 1) / THREADS;
    mulKernel<<<blocks, THREADS>>>(d_a, half, N);
    // Record the CUDA statuses now but check them only after the worker thread
    // has been joined, so a failure never terminates the process while that
    // thread is still running.
    const cudaError_t launchErr = cudaGetLastError();
    const cudaError_t syncErr = cudaDeviceSynchronize();

    // Wait for the CPU part.
    cpuThread.join();

    CUDA_CHECK(launchErr);
    CUDA_CHECK(syncErr);

    // Copy the GPU part back into the host array.
    CUDA_CHECK(cudaMemcpy(a.data() + half, d_a + half, gpuCount * sizeof(int), cudaMemcpyDeviceToHost));

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> hybrid_time = end - start;

    std::cout << "Hybrid time (ms): " << hybrid_time.count() << std::endl;
    std::cout << "Hybrid a[0]: " << a[0] << " a[N-1]: " << a[N-1] << std::endl;

    CUDA_CHECK(cudaFree(d_a));
    return 0;
}


Overwriting task3_hybrid.cu


In [31]:
!g++ task3_cpu.cpp -O2 -o task3_cpu

In [32]:
!./task3_cpu

CPU time (ms): 90.721
CPU a[0]: 2 a[N-1]: 2


In [33]:
!nvcc -arch=sm_75 task3_gpu.cu -O2 -o task3_gpu


In [34]:
!./task3_gpu

GPU time (ms): 3.4903
GPU a[0]: 2 a[N-1]: 2


In [45]:
!nvcc -arch=sm_75 task3_hybrid.cu -O2 -o task3_hybrid


In [47]:
!./task3_hybrid

Hybrid time (ms): 74.7657
Hybrid a[0]: 2 a[N-1]: 2
