<a href="https://colab.research.google.com/github/4en1x/diploma/blob/master/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# GitHub no longer serves the unauthenticated git:// protocol, so install over HTTPS.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin
# Every `!` line runs in its own subshell, so `cd` has to be chained with the
# command that depends on it; otherwise the symlink is created in the notebook's
# working directory instead of /usr/src.
!cd /usr/src && ln -sfn linux-headers-2.6.35-28-generic linux

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3l3c7qgd
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-l2rryjnm/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [29]:
%%cu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include "linux/kernel.h"
#include <iostream>
#include <ctime>
#include <iomanip>
#include <fstream>
#include <math.h>
#include <cmath>

using namespace std;

#define ITERATIONS 5000
#define N 1024

#define e 2.7182818

#define blockZSize 1

#define blockXSize 128
#define blockYSize 128

#define tileXSize 16
#define tileYSize 16

#define extN (N + 2 * blockZSize)
#define Dx 1
#define h ((float) Dx / (N + 1))
#define h2 powf(h, 2.0)

// Device-side term used by tile(): returns 2 * e^(i*h + j*h) for grid cell (i, j).
__device__
float f(int i, int j) {
	const float exponent = i * h + j * h;
	return 2 * powf(e, exponent);
}

// Value createMatrix() writes into the first grid row(s): e^x.
float u_t(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the first grid column(s): e^x.
float u_l(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid column(s): e^(x + 1).
float u_r(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid row(s): e^(x + 1).
float u_d(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}


/**
 * Relaxes one colour of the tileXSize x tileYSize tile owned by the calling thread.
 *
 * Every selected cell is overwritten in place with
 *   0.25 * (sum of its four neighbours - h2 * f(i, j)).
 *
 * matrix   - grid addressed row-major with a row stride of extN
 * iGl, jGl - block coordinates (the kernels pass blockIdx.x / blockIdx.y)
 * isBlack  - picks which half of the checkerboard is updated: within a row the
 *            first updated column is shifted by i % 2, or by 1 - i % 2 when set
 *
 * There is no bounds check: the launch must tile the interior exactly, i.e.
 * gridDim * blockDim * tile size must not exceed the N x N interior.
 */
__device__
void tile(float* matrix, int iGl, int jGl, bool isBlack) {
	const int iBegin = iGl * blockXSize + threadIdx.x * tileXSize + blockZSize;
	const int jBegin = jGl * blockYSize + threadIdx.y * tileYSize + blockZSize;

	// h2 expands to a powf() call that does not depend on the cell; evaluate it once.
	const float hSquared = h2;

	for (int i = iBegin; i < iBegin + tileXSize; i++) {
		// Alternating the starting column per row yields the checkerboard pattern.
		const int offset = isBlack ? 1 - i % 2 : i % 2;

		for (int j = jBegin + offset; j < jBegin + tileYSize; j += 2) {
			const int centre = extN * i + j;
			// 0.25f keeps the arithmetic in single precision; the former double
			// literal promoted the whole expression to double on the device.
			// Scaling by a power of two is exact either way, so results are unchanged.
			matrix[centre] = 0.25f * (
				matrix[centre + extN]
				+ matrix[centre - extN]
				+ matrix[centre + 1]
				+ matrix[centre - 1]
				- hSquared * f(i, j)
			);
		}
	}
}

/**
 * Kernel entry point for the half-sweep with isBlack = true.
 * Each thread forwards its block coordinates to tile().
 */
__global__
void calculateBlack(float* matrix) {
	const bool updateBlack = true;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

/**
 * Kernel entry point for the half-sweep with isBlack = false.
 * Each thread forwards its block coordinates to tile().
 */
__global__
void calculateRed(float* matrix) {
	const bool updateBlack = false;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

// Host-side reference value e^(i*h + j*h) that calculateDifference() compares the grid against.
float func(int i, int j) {
	const float exponent = i * h + j * h;
	return powf(e, exponent);
}

/**
 * Allocates a row x col grid (row-major, row stride = col), zero-filled, and
 * writes the boundary values into its outer blockZSize-wide frame.
 *
 * The first/last rows receive u_t / u_d and the first/last columns receive
 * u_l / u_r, each evaluated at (index - blockZSize + 1) * h. The columns are
 * written after the rows, so the corner cells end up holding the u_l / u_r values.
 *
 * The previous version used `row` as the row stride and swapped the row/col
 * loop bounds, which only worked for square grids and wrote out of bounds
 * otherwise. Square grids produce exactly the same contents as before.
 *
 * Returns a buffer allocated with new[]; the caller must release it with delete[].
 */
float* createMatrix(int row, int col) {
	// The trailing () value-initialises every element to 0.
	float* matrix = new float[(size_t)row * col]();

	// Top and bottom edges: i walks along the columns.
	for (int k = 0; k < blockZSize; k++) {
		for (int i = k; i < col - k; i++) {
			const float x = (i - blockZSize + 1) * h;
			matrix[k * col + i] = u_t(x);
			matrix[(row - 1 - k) * col + i] = u_d(x);
		}
	}

	// Left and right edges: j walks along the rows.
	for (int k = 0; k < blockZSize; k++) {
		for (int j = k; j < row - k; j++) {
			const float y = (j - blockZSize + 1) * h;
			matrix[j * col + k] = u_l(y);
			matrix[j * col + col - 1 - k] = u_r(y);
		}
	}

	return matrix;
}

/**
 * Returns the sum of squared deviations between the interior of the
 * extN x extN grid `matrix1` and the reference values from func().
 *
 * The sum runs over roughly N*N terms, so it is accumulated in double and only
 * narrowed to float on return; a float accumulator loses low-order digits once
 * the running total dwarfs the individual terms.
 */
float calculateDifference(float* matrix1) {
	double tolerance = 0.0;
	for (int i = blockZSize; i < extN - blockZSize; i++) {
		for (int j = blockZSize; j < extN - blockZSize; j++) {
			const float fCurrent = func(i - blockZSize + 1, j - blockZSize + 1);
			const float deviation = matrix1[i * extN + j] - fCurrent;
			tolerance += (double)deviation * deviation;
		}
	}
	return (float)tolerance;
}

// Prints a diagnostic for a failed CUDA runtime call; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		std::cerr << "Cuda error (" << what << "): " << cudaGetErrorString(status) << std::endl;
		return false;
	}
	return true;
}

/**
 * Runs ITERATIONS sweeps, each consisting of a calculateBlack launch followed
 * by a calculateRed launch. After every sweep the grid is copied back to the
 * host and its squared deviation from func() is appended to diff_1.txt and
 * echoed to stdout together with the iteration index.
 *
 * Every CUDA call is checked; the first failure stops the run, is reported on
 * stderr and makes the program exit with status 1. Host and device buffers are
 * released before returning.
 */
int main() {
	ofstream myfile;
	myfile.open("diff_1.txt");
	if (!myfile.is_open()) {
		// Best effort: keep computing, the values are still printed to stdout.
		std::cerr << "Warning: could not open diff_1.txt for writing" << std::endl;
	}

	float* matrixHost = createMatrix(extN, extN);
	float* matrixDevice = NULL;

	const size_t matrixSize = sizeof(float) * extN * extN;

	bool ok = cudaOk(cudaMalloc((void**)&matrixDevice, matrixSize), "cudaMalloc")
		&& cudaOk(cudaMemcpy(matrixDevice, matrixHost, matrixSize, cudaMemcpyHostToDevice),
		          "cudaMemcpy host to device");

	// clock() ticks; the measured span also covers the per-iteration copy back
	// to the host and the host-side deviation computation.
	long startTime = clock();
	dim3 blocks(N / blockXSize, N / blockYSize);
	dim3 threads(blockXSize / tileXSize, blockYSize / tileYSize);

	for (size_t i = 0; ok && i < ITERATIONS; i++) {
		calculateBlack <<< blocks, threads >>>(matrixDevice);
		// cudaGetLastError catches a bad launch configuration, the
		// synchronisation catches faults raised while the kernel was running.
		ok = cudaOk(cudaGetLastError(), "calculateBlack launch")
			&& cudaOk(cudaDeviceSynchronize(), "calculateBlack");
		if (!ok) {
			break;
		}

		calculateRed <<< blocks, threads >>>(matrixDevice);
		ok = cudaOk(cudaGetLastError(), "calculateRed launch")
			&& cudaOk(cudaDeviceSynchronize(), "calculateRed");
		if (!ok) {
			break;
		}

		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "cudaMemcpy device to host");
		if (!ok) {
			break;
		}

		float diff = calculateDifference(matrixHost);
		myfile << diff << " ";
		cout << diff << " " << i << ";";
	}

	std::cout << "Calculation take time: " << clock() - startTime << std::endl;

	// Fetch the final state before evaluating it (the copy used to come after the print).
	if (ok) {
		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "final cudaMemcpy device to host");
	}
	if (ok) {
		cout << " Difference: " << calculateDifference(matrixHost);
	}

	myfile.close();

	// cudaFree accepts a null pointer, so this is safe even if cudaMalloc failed.
	cudaFree(matrixDevice);
	delete[] matrixHost;

	return ok ? 0 : 1;
}

'3.09098e+06 0;3.08682e+06 1;3.08358e+06 2;3.08084e+06 3;3.07841e+06 4;3.07621e+06 5;3.07419e+06 6;3.07232e+06 7;3.07053e+06 8;3.06885e+06 9;3.06726e+06 10;3.06575e+06 11;3.06428e+06 12;3.06288e+06 13;3.06151e+06 14;3.06021e+06 15;3.05893e+06 16;3.05774e+06 17;3.05655e+06 18;3.05537e+06 19;3.05423e+06 20;3.0531e+06 21;3.05202e+06 22;3.05096e+06 23;3.04992e+06 24;3.04888e+06 25;3.04789e+06 26;3.0469e+06 27;3.04594e+06 28;3.045e+06 29;3.04407e+06 30;3.04314e+06 31;3.04224e+06 32;3.04135e+06 33;3.04047e+06 34;3.0396e+06 35;3.03874e+06 36;3.0379e+06 37;3.03707e+06 38;3.03624e+06 39;3.03543e+06 40;3.03463e+06 41;3.03384e+06 42;3.03306e+06 43;3.0323e+06 44;3.03153e+06 45;3.03077e+06 46;3.03003e+06 47;3.02929e+06 48;3.02856e+06 49;3.02785e+06 50;3.02715e+06 51;3.0265e+06 52;3.0258e+06 53;3.0251e+06 54;3.02441e+06 55;3.02372e+06 56;3.02308e+06 57;3.02241e+06 58;3.02174e+06 59;3.02107e+06 60;3.02042e+06 61;3.01977e+06 62;3.01912e+06 63;3.01847e+06 64;3.01784e+06 65;3.0172e+06 66;3.01659e+06 67;

In [30]:
%%cu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include "linux/kernel.h"
#include <iostream>
#include <ctime>
#include <iomanip>
#include <fstream>
#include <math.h>
#include <cmath>

using namespace std;

#define ITERATIONS 5000
#define N 1024

#define e 2.7182818

#define blockZSize 1

#define blockXSize 128
#define blockYSize 128

#define tileXSize 16
#define tileYSize 16

#define extN (N + 2 * blockZSize)
#define Dx 1
#define h ((float) Dx / (N + 1))
#define h2 powf(h, 2.0)

// Device-side term used by tile(): returns 2 * e^(i*h + j*h) for grid cell (i, j).
__device__
float f(int i, int j) {
	const float exponent = i * h + j * h;
	return 2 * powf(e, exponent);
}

// Value createMatrix() writes into the first grid row(s): e^x.
float u_t(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the first grid column(s): e^x.
float u_l(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid column(s): e^(x + 1).
float u_r(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid row(s): e^(x + 1).
float u_d(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}


/**
 * Relaxes every cell of the tileXSize x tileYSize tile owned by the calling thread.
 *
 * Each cell is overwritten in place with
 *   0.25 * (sum of its four neighbours - h2 * f(i, j)).
 *
 * matrix   - grid addressed row-major with a row stride of extN
 * iGl, jGl - block coordinates (the kernels pass blockIdx.x / blockIdx.y)
 * isBlack  - accepted for signature compatibility but ignored: this variant
 *            applies no checkerboard colouring
 *
 * NOTE(review): cells on a tile edge read neighbours that belong to another
 * thread's tile, and those are rewritten during the same launch without any
 * synchronisation, so the values picked up depend on thread scheduling.
 * This looks like the intended "uncoloured" experiment - confirm before
 * relying on reproducible output.
 *
 * There is no bounds check: the launch must tile the interior exactly.
 */
__device__
void tile(float* matrix, int iGl, int jGl, bool isBlack) {
	(void)isBlack;

	const int iBegin = iGl * blockXSize + threadIdx.x * tileXSize + blockZSize;
	const int jBegin = jGl * blockYSize + threadIdx.y * tileYSize + blockZSize;

	// h2 expands to a powf() call that does not depend on the cell; evaluate it once.
	const float hSquared = h2;

	for (int i = iBegin; i < iBegin + tileXSize; i++) {
		for (int j = jBegin; j < jBegin + tileYSize; j += 1) {
			const int centre = extN * i + j;
			// 0.25f keeps the arithmetic in single precision; the former double
			// literal promoted the whole expression to double on the device.
			matrix[centre] = 0.25f * (
				matrix[centre + extN]
				+ matrix[centre - extN]
				+ matrix[centre + 1]
				+ matrix[centre - 1]
				- hSquared * f(i, j)
			);
		}
	}
}

/**
 * Kernel entry point: each thread forwards its block coordinates to tile()
 * with isBlack = true.
 */
__global__
void calculateBlack(float* matrix) {
	const bool updateBlack = true;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

/**
 * Kernel entry point: each thread forwards its block coordinates to tile()
 * with isBlack = false. The main() of this cell does not launch it.
 */
__global__
void calculateRed(float* matrix) {
	const bool updateBlack = false;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

// Host-side reference value e^(i*h + j*h) that calculateDifference() compares the grid against.
float func(int i, int j) {
	const float exponent = i * h + j * h;
	return powf(e, exponent);
}

/**
 * Allocates a row x col grid (row-major, row stride = col), zero-filled, and
 * writes the boundary values into its outer blockZSize-wide frame.
 *
 * The first/last rows receive u_t / u_d and the first/last columns receive
 * u_l / u_r, each evaluated at (index - blockZSize + 1) * h. The columns are
 * written after the rows, so the corner cells end up holding the u_l / u_r values.
 *
 * The previous version used `row` as the row stride and swapped the row/col
 * loop bounds, which only worked for square grids and wrote out of bounds
 * otherwise. Square grids produce exactly the same contents as before.
 *
 * Returns a buffer allocated with new[]; the caller must release it with delete[].
 */
float* createMatrix(int row, int col) {
	// The trailing () value-initialises every element to 0.
	float* matrix = new float[(size_t)row * col]();

	// Top and bottom edges: i walks along the columns.
	for (int k = 0; k < blockZSize; k++) {
		for (int i = k; i < col - k; i++) {
			const float x = (i - blockZSize + 1) * h;
			matrix[k * col + i] = u_t(x);
			matrix[(row - 1 - k) * col + i] = u_d(x);
		}
	}

	// Left and right edges: j walks along the rows.
	for (int k = 0; k < blockZSize; k++) {
		for (int j = k; j < row - k; j++) {
			const float y = (j - blockZSize + 1) * h;
			matrix[j * col + k] = u_l(y);
			matrix[j * col + col - 1 - k] = u_r(y);
		}
	}

	return matrix;
}

/**
 * Returns the sum of squared deviations between the interior of the
 * extN x extN grid `matrix1` and the reference values from func().
 *
 * The sum runs over roughly N*N terms, so it is accumulated in double and only
 * narrowed to float on return; a float accumulator loses low-order digits once
 * the running total dwarfs the individual terms.
 */
float calculateDifference(float* matrix1) {
	double tolerance = 0.0;
	for (int i = blockZSize; i < extN - blockZSize; i++) {
		for (int j = blockZSize; j < extN - blockZSize; j++) {
			const float fCurrent = func(i - blockZSize + 1, j - blockZSize + 1);
			const float deviation = matrix1[i * extN + j] - fCurrent;
			tolerance += (double)deviation * deviation;
		}
	}
	return (float)tolerance;
}

// Prints a diagnostic for a failed CUDA runtime call; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		std::cerr << "Cuda error (" << what << "): " << cudaGetErrorString(status) << std::endl;
		return false;
	}
	return true;
}

/**
 * Runs ITERATIONS sweeps, each a single calculateBlack launch (calculateRed is
 * not used in this variant). After every sweep the grid is copied back to the
 * host and its squared deviation from func() is appended to diff_2.txt and
 * echoed to stdout together with the iteration index.
 *
 * Every CUDA call is checked; the first failure stops the run, is reported on
 * stderr and makes the program exit with status 1. Host and device buffers are
 * released before returning.
 */
int main() {
	ofstream myfile;
	myfile.open("diff_2.txt");
	if (!myfile.is_open()) {
		// Best effort: keep computing, the values are still printed to stdout.
		std::cerr << "Warning: could not open diff_2.txt for writing" << std::endl;
	}

	float* matrixHost = createMatrix(extN, extN);
	float* matrixDevice = NULL;

	const size_t matrixSize = sizeof(float) * extN * extN;

	bool ok = cudaOk(cudaMalloc((void**)&matrixDevice, matrixSize), "cudaMalloc")
		&& cudaOk(cudaMemcpy(matrixDevice, matrixHost, matrixSize, cudaMemcpyHostToDevice),
		          "cudaMemcpy host to device");

	// clock() ticks; the measured span also covers the per-iteration copy back
	// to the host and the host-side deviation computation.
	long startTime = clock();
	dim3 blocks(N / blockXSize, N / blockYSize);
	dim3 threads(blockXSize / tileXSize, blockYSize / tileYSize);

	for (size_t i = 0; ok && i < ITERATIONS; i++) {
		calculateBlack <<< blocks, threads >>>(matrixDevice);
		// cudaGetLastError catches a bad launch configuration, the
		// synchronisation catches faults raised while the kernel was running.
		ok = cudaOk(cudaGetLastError(), "calculateBlack launch")
			&& cudaOk(cudaDeviceSynchronize(), "calculateBlack");
		if (!ok) {
			break;
		}

		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "cudaMemcpy device to host");
		if (!ok) {
			break;
		}

		float diff = calculateDifference(matrixHost);
		myfile << diff << " ";
		cout << diff << " " << i << ";";
	}

	std::cout << "Calculation take time: " << clock() - startTime << std::endl;

	// Fetch the final state before evaluating it (the copy used to come after the print).
	if (ok) {
		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "final cudaMemcpy device to host");
	}
	if (ok) {
		cout << " Difference: " << calculateDifference(matrixHost);
	}

	myfile.close();

	// cudaFree accepts a null pointer, so this is safe even if cudaMalloc failed.
	cudaFree(matrixDevice);
	delete[] matrixHost;

	return ok ? 0 : 1;
}

'3.09059e+06 0;3.08694e+06 1;3.08393e+06 2;3.08133e+06 3;3.07901e+06 4;3.07689e+06 5;3.07492e+06 6;3.07309e+06 7;3.07134e+06 8;3.06972e+06 9;3.06815e+06 10;3.06663e+06 11;3.06522e+06 12;3.06386e+06 13;3.0625e+06 14;3.0612e+06 15;3.05995e+06 16;3.05876e+06 17;3.05758e+06 18;3.0564e+06 19;3.05527e+06 20;3.0542e+06 21;3.05311e+06 22;3.05203e+06 23;3.05101e+06 24;3.05001e+06 25;3.04902e+06 26;3.04806e+06 27;3.04706e+06 28;3.04611e+06 29;3.04519e+06 30;3.04429e+06 31;3.04337e+06 32;3.0425e+06 33;3.04162e+06 34;3.04078e+06 35;3.03989e+06 36;3.03907e+06 37;3.03823e+06 38;3.03742e+06 39;3.0366e+06 40;3.03582e+06 41;3.035e+06 42;3.03423e+06 43;3.03348e+06 44;3.03271e+06 45;3.03199e+06 46;3.03125e+06 47;3.03048e+06 48;3.02976e+06 49;3.02904e+06 50;3.02835e+06 51;3.02771e+06 52;3.02702e+06 53;3.02633e+06 54;3.02562e+06 55;3.02497e+06 56;3.02432e+06 57;3.02362e+06 58;3.02296e+06 59;3.02231e+06 60;3.02164e+06 61;3.02099e+06 62;3.02036e+06 63;3.0197e+06 64;3.01907e+06 65;3.01845e+06 66;3.01782e+06 6

In [0]:
%%cu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include "linux/kernel.h"
#include <iostream>
#include <ctime>
#include <iomanip>
#include <fstream>
#include <math.h>
#include <cmath>

using namespace std;

#define ITERATIONS 30000
#define N 1024

#define e 2.7182818

#define blockZSize 1

#define blockXSize 128
#define blockYSize 128

#define tileXSize 16
#define tileYSize 16

#define extN (N + 2 * blockZSize)
#define Dx 1
#define h ((float) Dx / (N + 1))
#define h2 powf(h, 2.0)

// Device-side term used by tile(): returns 2 * e^(i*h + j*h) for grid cell (i, j).
__device__
float f(int i, int j) {
	const float exponent = i * h + j * h;
	return 2 * powf(e, exponent);
}

// Value createMatrix() writes into the first grid row(s): e^x.
float u_t(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the first grid column(s): e^x.
float u_l(float x) {
	const float exponent = x;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid column(s): e^(x + 1).
float u_r(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}

// Value createMatrix() writes into the last grid row(s): e^(x + 1).
float u_d(float x) {
	const float exponent = x + 1;
	return powf(e, exponent);
}


/**
 * Relaxes every cell of the tileXSize x tileYSize tile owned by the calling thread.
 *
 * Each cell is overwritten in place with
 *   0.25 * (sum of its four neighbours - h2 * f(i, j)).
 *
 * matrix   - grid addressed row-major with a row stride of extN
 * iGl, jGl - block coordinates (the kernels pass blockIdx.x / blockIdx.y)
 * isBlack  - accepted for signature compatibility but ignored: this variant
 *            applies no checkerboard colouring
 *
 * NOTE(review): cells on a tile edge read neighbours that belong to another
 * thread's tile, and those are rewritten during the same launch without any
 * synchronisation, so the values picked up depend on thread scheduling.
 * This looks like the intended "uncoloured" experiment - confirm before
 * relying on reproducible output.
 *
 * There is no bounds check: the launch must tile the interior exactly.
 */
__device__
void tile(float* matrix, int iGl, int jGl, bool isBlack) {
	(void)isBlack;

	const int iBegin = iGl * blockXSize + threadIdx.x * tileXSize + blockZSize;
	const int jBegin = jGl * blockYSize + threadIdx.y * tileYSize + blockZSize;

	// h2 expands to a powf() call that does not depend on the cell; evaluate it once.
	const float hSquared = h2;

	for (int i = iBegin; i < iBegin + tileXSize; i++) {
		for (int j = jBegin; j < jBegin + tileYSize; j += 1) {
			const int centre = extN * i + j;
			// 0.25f keeps the arithmetic in single precision; the former double
			// literal promoted the whole expression to double on the device.
			matrix[centre] = 0.25f * (
				matrix[centre + extN]
				+ matrix[centre - extN]
				+ matrix[centre + 1]
				+ matrix[centre - 1]
				- hSquared * f(i, j)
			);
		}
	}
}

/**
 * Kernel entry point: each thread forwards its block coordinates to tile()
 * with isBlack = true.
 */
__global__
void calculateBlack(float* matrix) {
	const bool updateBlack = true;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

/**
 * Kernel entry point: each thread forwards its block coordinates to tile()
 * with isBlack = false. The main() of this cell does not launch it.
 */
__global__
void calculateRed(float* matrix) {
	const bool updateBlack = false;
	tile(matrix, blockIdx.x, blockIdx.y, updateBlack);
}

// Host-side reference value e^(i*h + j*h) that calculateDifference() compares the grid against.
float func(int i, int j) {
	const float exponent = i * h + j * h;
	return powf(e, exponent);
}

/**
 * Allocates a row x col grid (row-major, row stride = col), zero-filled, and
 * writes the boundary values into its outer blockZSize-wide frame.
 *
 * The first/last rows receive u_t / u_d and the first/last columns receive
 * u_l / u_r, each evaluated at (index - blockZSize + 1) * h. The columns are
 * written after the rows, so the corner cells end up holding the u_l / u_r values.
 *
 * The previous version used `row` as the row stride and swapped the row/col
 * loop bounds, which only worked for square grids and wrote out of bounds
 * otherwise. Square grids produce exactly the same contents as before.
 *
 * Returns a buffer allocated with new[]; the caller must release it with delete[].
 */
float* createMatrix(int row, int col) {
	// The trailing () value-initialises every element to 0.
	float* matrix = new float[(size_t)row * col]();

	// Top and bottom edges: i walks along the columns.
	for (int k = 0; k < blockZSize; k++) {
		for (int i = k; i < col - k; i++) {
			const float x = (i - blockZSize + 1) * h;
			matrix[k * col + i] = u_t(x);
			matrix[(row - 1 - k) * col + i] = u_d(x);
		}
	}

	// Left and right edges: j walks along the rows.
	for (int k = 0; k < blockZSize; k++) {
		for (int j = k; j < row - k; j++) {
			const float y = (j - blockZSize + 1) * h;
			matrix[j * col + k] = u_l(y);
			matrix[j * col + col - 1 - k] = u_r(y);
		}
	}

	return matrix;
}

/**
 * Returns the sum of squared deviations between the interior of the
 * extN x extN grid `matrix1` and the reference values from func().
 *
 * The sum runs over roughly N*N terms, so it is accumulated in double and only
 * narrowed to float on return; a float accumulator loses low-order digits once
 * the running total dwarfs the individual terms.
 */
float calculateDifference(float* matrix1) {
	double tolerance = 0.0;
	for (int i = blockZSize; i < extN - blockZSize; i++) {
		for (int j = blockZSize; j < extN - blockZSize; j++) {
			const float fCurrent = func(i - blockZSize + 1, j - blockZSize + 1);
			const float deviation = matrix1[i * extN + j] - fCurrent;
			tolerance += (double)deviation * deviation;
		}
	}
	return (float)tolerance;
}

// Prints a diagnostic for a failed CUDA runtime call; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		std::cerr << "Cuda error (" << what << "): " << cudaGetErrorString(status) << std::endl;
		return false;
	}
	return true;
}

/**
 * Runs ITERATIONS sweeps, each a single calculateBlack launch (calculateRed is
 * not used in this variant). After every sweep the grid is copied back to the
 * host and its squared deviation from func() is appended to diff_2.txt and
 * echoed to stdout together with the iteration index.
 *
 * Every CUDA call is checked; the first failure stops the run, is reported on
 * stderr and makes the program exit with status 1. Host and device buffers are
 * released before returning.
 */
int main() {
	ofstream myfile;
	// NOTE(review): the 5000-iteration cell earlier in this notebook writes to the
	// same diff_2.txt, so running this cell overwrites that file - confirm this is intended.
	myfile.open("diff_2.txt");
	if (!myfile.is_open()) {
		// Best effort: keep computing, the values are still printed to stdout.
		std::cerr << "Warning: could not open diff_2.txt for writing" << std::endl;
	}

	float* matrixHost = createMatrix(extN, extN);
	float* matrixDevice = NULL;

	const size_t matrixSize = sizeof(float) * extN * extN;

	bool ok = cudaOk(cudaMalloc((void**)&matrixDevice, matrixSize), "cudaMalloc")
		&& cudaOk(cudaMemcpy(matrixDevice, matrixHost, matrixSize, cudaMemcpyHostToDevice),
		          "cudaMemcpy host to device");

	// clock() ticks; the measured span also covers the per-iteration copy back
	// to the host and the host-side deviation computation.
	long startTime = clock();
	dim3 blocks(N / blockXSize, N / blockYSize);
	dim3 threads(blockXSize / tileXSize, blockYSize / tileYSize);

	for (size_t i = 0; ok && i < ITERATIONS; i++) {
		calculateBlack <<< blocks, threads >>>(matrixDevice);
		// cudaGetLastError catches a bad launch configuration, the
		// synchronisation catches faults raised while the kernel was running.
		ok = cudaOk(cudaGetLastError(), "calculateBlack launch")
			&& cudaOk(cudaDeviceSynchronize(), "calculateBlack");
		if (!ok) {
			break;
		}

		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "cudaMemcpy device to host");
		if (!ok) {
			break;
		}

		float diff = calculateDifference(matrixHost);
		myfile << diff << " ";
		cout << diff << " " << i << ";";
	}

	std::cout << "Calculation take time: " << clock() - startTime << std::endl;

	// Fetch the final state before evaluating it (the copy used to come after the print).
	if (ok) {
		ok = cudaOk(cudaMemcpy(matrixHost, matrixDevice, matrixSize, cudaMemcpyDeviceToHost),
		            "final cudaMemcpy device to host");
	}
	if (ok) {
		cout << " Difference: " << calculateDifference(matrixHost);
	}

	myfile.close();

	// cudaFree accepts a null pointer, so this is safe even if cudaMalloc failed.
	cudaFree(matrixDevice);
	delete[] matrixHost;

	return ok ? 0 : 1;
}