In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3gd08j7p
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-3gd08j7p
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4294 sha256=8c6de3b49ee548050bc0464dc52d37efdfa596ca659a8604c89047b0dd784f73
  Stored in directory: /tmp/pip-ephem-wheel-cache-04zy21st/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content

# **Algoritmo sequenziale:**
Implementazione algoritmo Greedy per la colorazione del grafo.

## **Funzionamento:**

*   Costruzione del grafo $\mathcal{G}(n, prob)$, dove $n$ è il numero di nodi e $prob$ la probabilità di estrazione di un arco.
*   Selezione di una permutazione casuale $σ$ sull' insieme $\{0 .. n\}.$
*   $∀i \in \{0 .. n\}$ il vertice $v_{σ_i}$ viene colorato con il primo colore disponibile.

## **Struttura:**

*   CPUColorer.cu → implementazione del colorer.
*   testerColorer.cu → costruzione del grafo e misurazione del tempo di esecuzione.



##**Risultati:**
Con un grafo $\mathcal{G}(10000, 1)$ si ottiene: 941.029419 ms

##**Problematica:**
Rispetto all'algoritmo implementato in parallelo risulta molto più veloce, quando il tempo di esecuzione dovrebbe essere $\mathcal{O}(n^2)$ nel caso di grafo completo.



In [302]:
%%cuda --name CPUColorer.cu

#include <iostream>
#include <cstdio>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/random.h>
#include <thrust/count.h>

#include "/content/drive/MyDrive/graphcoloring/graph/coloring.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph_d.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph.h"
#include "/content/drive/MyDrive/graphcoloring/utils/common.h"

using namespace std;

void CPUcolorer(Coloring * col, GraphStruct *str){
	int n = str->nodeSize;

  int * perm = (int *) malloc(n * sizeof(int));
  thrust::sequence(perm, perm + n);
  thrust::default_random_engine g;
  thrust::shuffle(perm, perm + n, g);
 int counter = 0;

  for(int i = 0; i < n; i++){
    int currentNode = perm[i];
    uint offset = str->cumDegs[currentNode];
    uint deg = str->deg(currentNode);
    printf("%d ", counter);

    for (uint j = 0; j < deg; j++) {
        counter++;
      uint neighID = str->neighs[offset + j];
      int jColor = col->coloring[neighID];
      if (jColor != -1) col->usedColors[jColor] = true;

    }

    for(uint c = 0; c < n; c++){
      if(!col->usedColors[c]){
          col->coloring[currentNode] = c;
          break;
      }
    }
  }
}

Coloring* graphColoring(GraphStruct *str){
	int n = str->nodeSize;

	Coloring* col = (Coloring * ) malloc(sizeof(Coloring));

  col->coloring = (int *) malloc(n * sizeof(int));
	thrust::fill(col->coloring, col->coloring + n, -1);

  col->usedColors = (bool * ) malloc(n * sizeof(bool));
  thrust::fill(col->usedColors, col->usedColors + n, false);

  CPUcolorer(col, str);

	return col;
}

'File written in /content/src/CPUColorer.cu'

In [303]:
%%cuda --name testerColorer.cu

#include <iostream>
#include "/content/drive/MyDrive/graphcoloring/graph/coloring.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph.h"
#include "/content/drive/MyDrive/graphcoloring/utils/common.h"

#define THREADxBLOCK 128

// da fare: aggiungere stream? ad esempio per eseguire in modo concorrente stampa del grafo e la sua colorazione

int main(void) {
    unsigned int n = 10000;		 // number of nodes for random graphs
    float prob = 1;				    // density (percentage) for random graphs
    std::default_random_engine eng{0};  // fixed seed

    srand(time(0));
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // new graph with n nodes
    Graph graph(n, 1);

    // generate a random graph
    graph.randGraph(prob, eng);

    // get the graph struct
    GraphStruct *str = graph.getStruct();

    //print_d <<< 1, 1 >>> (str, true);
    //cudaDeviceSynchronize();

    cudaEventRecord(start);

    Coloring* col = graphColoring(str);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    //Stampo in millisecondi quanto tempo ci ha messo a colorare il grafo.
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("\n%f ms\n", milliseconds);

    int maxColor = 0;
    printf("Coloratura trovata: ");
    for(int i = 0; i < str->nodeSize; i++){
        if(maxColor < col->coloring[i]) maxColor = col->coloring[i];
        printf("%d ", col->coloring[i]);
    }
    printf("\nColore massimo: %d", maxColor+1);
    //printColoring(col, str, 1);

    return EXIT_SUCCESS;
}


'File written in /content/src/testerColorer.cu'

In [304]:
!nvcc -dc src/testerColorer.cu /content/src/CPUColorer.cu /content/drive/MyDrive/graphcoloring/graph/graph.cpp /content/drive/MyDrive/graphcoloring/graph/graph_d.cu
!nvcc testerColorer.o CPUColorer.o graph.o graph_d.o -o testerColorer
!./testerColorer

0 9999 19998 29997 39996 49995 59994 69993 79992 89991 99990 109989 119988 129987 139986 149985 159984 169983 179982 189981 199980 209979 219978 229977 239976 249975 259974 269973 279972 289971 299970 309969 319968 329967 339966 349965 359964 369963 379962 389961 399960 409959 419958 429957 439956 449955 459954 469953 479952 489951 499950 509949 519948 529947 539946 549945 559944 569943 579942 589941 599940 609939 619938 629937 639936 649935 659934 669933 679932 689931 699930 709929 719928 729927 739926 749925 759924 769923 779922 789921 799920 809919 819918 829917 839916 849915 859914 869913 879912 889911 899910 909909 919908 929907 939906 949905 959904 969903 979902 989901 999900 1009899 1019898 1029897 1039896 1049895 1059894 1069893 1079892 1089891 1099890 1109889 1119888 1129887 1139886 1149885 1159884 1169883 1179882 1189881 1199880 1209879 1219878 1229877 1239876 1249875 1259874 1269873 1279872 1289871 1299870 1309869 1319868 1329867 1339866 1349865 1359864 1369863 1379862 13898

# **Algoritmo visto a lezione:**
Implementazione algoritmo Luby per la colorazione del grafo.

##**Risultati:**
Con un grafo $\mathcal{G}(10000, 1)$ si ottiene: 26493.421875 ms

##**Problematica:**
Rispetto all'algoritmo implementato in maniera sequenziale risulta molto più lento e soprattutto trova come coloratura massima 9760, inferiore al numero di nodi.


In [312]:
%%cuda --name coloring.h
#pragma once

#include <curand_kernel.h>
#include "/content/drive/MyDrive/graphcoloring/graph/graph.h"
#include "/content/drive/MyDrive/graphcoloring/utils/common.h"

/**
 *  graph coloring struct (colors are: 1,2,3,..,k)
 */

struct Coloring {
	bool		uncoloredNodes;
	uint		numOfColors;
	uint	*	coloring;   // each element denotes a color
};

Coloring* LubyGreedy(GraphStruct*);
void printColoring (Coloring*, GraphStruct*, bool);
__global__ void LubyJPcolorer (Coloring*, GraphStruct*, uint*) ;
__global__ void init(uint seed, curandState_t*, uint*, uint);
__global__ void findIS (Coloring*, GraphStruct*, uint*);
__global__ void print_d(GraphStruct*, bool);

'File written in /content/src/coloring.h'

In [308]:
%%cuda --name Luby.cu


#include "coloring.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph_d.h"
#include "/content/drive/MyDrive/graphcoloring/utils/common.h"

using namespace std;

#define THREADxBLOCK 128

Coloring* LubyGreedy(GraphStruct *str) {
	// set coloring struct

	Coloring* col;
	gpuErrchk(cudaMallocManaged(&col, sizeof(Coloring)));
	uint n = str->nodeSize;
	col->uncoloredNodes = true;

	// cudaMalloc for arrays of struct Coloring
	gpuErrchk(cudaMallocManaged( &(col->coloring), n * sizeof(uint)));
	memset(col->coloring,0,n);

	// allocate space on the GPU for the random states
	curandState_t* states;
	uint* weigths;
	cudaMalloc((void**) &states, n * sizeof(curandState_t));
	cudaMalloc((void**) &weigths, n * sizeof(uint));
	dim3 threads ( THREADxBLOCK);
	dim3 blocks ((str->nodeSize + threads.x - 1) / threads.x, 1, 1 );
	uint seed = 0;
	init <<< blocks, threads >>> (seed, states, weigths, n);

	// start coloring (dyn. parall.)
	//LubyJPcolorer <<< 1, 1 >>> (col, str, weigths);

//#####################
	// loop on CPU

	// loop on ISs covering the graph
	col->numOfColors = 0;
	while (col->uncoloredNodes) {
		col->uncoloredNodes = false;
		col->numOfColors++;
		findIS <<< blocks, threads >>> (col, str, weigths);
		cudaDeviceSynchronize();
	}
//#####################


	cudaFree(states);
	cudaFree(weigths);
	return col;
}

/**
 * find an IS
 */
__global__ void findIS (Coloring* col, GraphStruct *str, uint* weights) {
	uint idx = threadIdx.x + blockDim.x * blockIdx.x;

	if (idx >= str->nodeSize)
		return;

	if (col->coloring[idx])
		return;

	uint offset = str->cumDegs[idx];
	uint deg = str->cumDegs[idx + 1] - str->cumDegs[idx];

	bool candidate = true;
	for (uint j = 0; j < deg; j++) {
		uint neighID = str->neighs[offset + j];
		if (!col->coloring[neighID] &&
				((weights[idx] < weights[neighID]) ||
				((weights[idx] == weights[neighID]) && idx < neighID))) {
			candidate = false;
		}
	}
	if (candidate) {
		col->coloring[idx] = col->numOfColors;
	}
	else
		col->uncoloredNodes = true;
}


/**
 *  this GPU kernel takes an array of states, and an array of ints, and puts a random int into each
 */
__global__ void init (uint seed, curandState_t* states, uint* numbers, uint n) {
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx > n)
			return;
	curand_init(seed, idx, 0, &states[idx]);
	numbers[idx] = curand(&states[idx])%n*n;
}


/**
 * Luby IS & Jones−Plassmann colorer
 */
__global__ void LubyJPcolorer (Coloring* col, GraphStruct *str, uint* weights) {
	dim3 threads (THREADxBLOCK);
	dim3 blocks ((str->nodeSize + threads.x - 1) / threads.x, 1, 1 );

	// loop on ISs covering the graph
	col->numOfColors = 0;
	while (col->uncoloredNodes) {
		col->uncoloredNodes = false;
		col->numOfColors++;
		findIS <<< blocks, threads >>> (col, str, weights);
		//cudaDeviceSynchronize();
	}
}



'File written in /content/src/Luby.cu'

In [317]:
%%cuda --name test_Luby.cu

#include "coloring.h"

int main(void) {
	unsigned int n = 10000;		 // number of nodes for random graphs
	float prob = 1;				    // density (percentage) for random graphs
	std::default_random_engine eng{0};  // fixed seed

  srand(time(0));
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

	// new graph with n nodes
	Graph graph(n,1);

	// generate a random graph
	graph.randGraph(prob,eng);

	// get the graph struct
	GraphStruct *str = graph.getStruct();

  //print_d <<< 1, 1 >>> (str, true);

  cudaEventRecord(start);

  Coloring* col = LubyGreedy(str);
  cudaDeviceSynchronize();

  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  //Stampo in millisecondi quanto tempo ci ha messo a colorare il grafo.
  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("%f ms\n", milliseconds);

  int maxColor = 0;
  printf("Coloratura trovata: ");
  for(int i = 0; i < str->nodeSize; i++){
      if(maxColor < col->coloring[i]) maxColor = col->coloring[i];
      printf("%d ", col->coloring[i]);
  }
  printf("\nColore massimo: %d", maxColor+1);
  //printColoring(col, str, 1);

	return EXIT_SUCCESS;
}

'File written in /content/src/test_Luby.cu'

In [318]:
!nvcc -dc src/test_Luby.cu src/Luby.cu /content/drive/MyDrive/graphcoloring/graph/graph.cpp /content/drive/MyDrive/graphcoloring/graph/graph_d.cu
!nvcc test_Luby.o Luby.o graph.o graph_d.o -o testLuby
!./testLuby

26493.421875 ms
Coloratura trovata: 2122 1781 8007 5867 1478 866 5878 1311 6334 9736 3085 1206 1937 6139 8971 5337 8490 3346 4648 7352 3306 8340 922 7405 6506 5996 6127 261 4209 7496 5132 3390 6927 3853 4550 2711 9673 278 5021 7829 1594 6016 7915 5522 2965 8866 6419 5165 2779 1618 5239 8643 6857 2115 7553 353 3226 3742 5613 132 6452 7911 8544 7841 7522 8053 3699 668 5452 8547 6765 170 3064 5701 4680 5818 1533 6654 5852 7209 8955 4066 2207 3261 2566 4433 4781 8875 9596 6499 8954 4840 2146 5297 9653 9034 2746 3534 1619 5180 3553 9144 8094 4148 8246 4412 3174 137 2142 9452 647 7543 5681 9436 632 8072 3607 193 1130 1962 4618 6903 4863 8691 5145 7608 1578 6991 2616 992 2743 7252 7926 4195 953 6875 9714 5234 9756 692 8324 2603 6262 8021 4916 6488 679 2702 4178 1333 8136 3408 1103 1599 5761 3500 8876 9503 5307 3389 9158 465 1118 3233 8143 6261 6039 5834 5140 8076 3073 4481 1796 4518 2016 5142 8505 2141 9584 9507 4197 5645 6108 6807 1652 243 2943 3560 5558 7178 8689 742 1841 5838 3972 7407 791

# **Algoritmo Largest Degree First (LDF):**
Implementazione algoritmo Largest Degree First (LDF) per la colorazione del grafo.

# **Funzionamento:**
*   Costruzione del grafo $\mathcal{G}(n, prob)$, dove $n$ è il numero di nodi e $prob$ la probabilità di estrazione di un arco.
*   Assegnamento casuale dei pesi $w$ sull' insieme $\{0 .. n\}.$
*   (In parallelo) $∀i \in \{0 .. n\}$ il vertice $v_i$ viene selezionato come candidato se risulta essere un massimo locale utilizzado come metrica i gradi (nel caso di ugualianza si confrontano i pesi).
*   (In parallelo con stream?) Colorazione del candidato trovato al punto precedente con il primo colore disponibile.

# **Struttura:**
*   LDFColorer.cu → implementazione del colorer.
*   testerColorer.cu → costruzione del grafo e misurazione del tempo di esecuzione.

##**Risultati:**
Con un grafo $\mathcal{G}(10000, 1)$ si ottiene: 253779.406250 ms

##**Problematica:**
Rispetto all'algoritmo implementato in maniera sequenziale risulta MOLTO più lento.


In [298]:

%%cuda --name LDFColorer.cu

#include <cuda.h>
#include <iostream>
#include <curand_kernel.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/random.h>
#include <thrust/count.h>

#include "/content/drive/MyDrive/graphcoloring/graph/coloring.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph.h"
#include "/content/drive/MyDrive/graphcoloring/graph/graph_d.h"
#include "/content/drive/MyDrive/graphcoloring/utils/common.h"

#define THREADxBLOCK 128
#define NSTREAM 4

using namespace std;

__global__ void findCandidate(Coloring* col, GraphStruct *str, bool * candidate){
    int n = str->nodeSize;
	  uint i = threadIdx.x + blockDim.x * blockIdx.x;
		bool flag = true; // vera sse il nodo ha peso locale massimo

    if (i >= n) return;

		// ignora i nodi già colorati
		if (col->coloring[i] != -1) return;

		int iWeight = str->weights[i];

		// guarda i pesi del vicinato
		uint offset = str->cumDegs[i];
		uint deg = str->cumDegs[i + 1] - str->cumDegs[i];

		for (uint j = 0; j < deg; j++) {
			uint neighID = str->neighs[offset + j];
      uint neighDeg = str->cumDegs[neighID + 1] - str->cumDegs[neighID];
			// ignora i vicini già colorati (e te stesso)
			int jColor = col->coloring[neighID];

      if(jColor != -1){
          col->usedColors[n * i + jColor] = true;
          continue;
      }
      if(i == neighID) continue;

			int jWeight = str->weights[neighID];

			if (deg < neighDeg){
          flag = false;
      }
      else if (deg == neighDeg){
          if(iWeight < jWeight){
              flag = false;
          }
      }


  }
    if(flag){
        candidate[i] = true;
      }
}

__global__ void colorer (Coloring* col, GraphStruct *str, bool* candidate, int offset) {
  uint i = threadIdx.x + blockDim.x * blockIdx.x;
  int n = str->nodeSize;

  if(candidate[i]){
        int color = 0;
        while (col->usedColors[n * (i+offset) + color]) color++;

        // Assegna il primo colore libero al nodo corrente
        col->coloring[i + offset] = color;
    }
}

Coloring* graphColoring(GraphStruct *str) {
	int n = str->nodeSize;
    int r = rand();



	Coloring* col;
	gpuErrchk(cudaMallocManaged(&col, sizeof(Coloring)));

    gpuErrchk(cudaMallocManaged(&(col->coloring), n * sizeof(int)));
	thrust::fill(col->coloring, col->coloring + n, -1);


    gpuErrchk(cudaMallocManaged(&(col->usedColors), n * n * sizeof(bool)));
    thrust::fill(col->usedColors, col->usedColors + (n * n), false);

    bool* candidate;
    gpuErrchk(cudaMallocManaged(&(candidate), n * sizeof(bool)));
    thrust::fill(candidate, candidate + n, false);

    // Generazione pesi
    thrust::sequence(str->weights, str->weights + n);
    thrust::default_random_engine g;
    thrust::shuffle(str->weights, str->weights + n, g);

 int iElem = n / NSTREAM;
	dim3 threads ( THREADxBLOCK);
	dim3 blocks ((n + threads.x - 1) / threads.x, 1, 1 );
  dim3 blocksS ((iElem + threads.x - 1) / threads.x, 1, 1);



    cudaStream_t stream[NSTREAM];

    for (int i = 0; i < NSTREAM; ++i)
      gpuErrchk(cudaStreamCreate(&stream[i]));

	for(int c = 0; c < n; c++){
    findCandidate<<<blocks, threads>>>(col, str, candidate);
    gpuErrchk(cudaPeekAtLastError());
		gpuErrchk(cudaDeviceSynchronize());
    for(int i=0; i < NSTREAM; ++i){
        int ioffset = i * iElem;
        colorer<<<blocksS, threads, 0, stream[i]>>>(col, str, &candidate[ioffset], ioffset);
    }
    gpuErrchk(cudaPeekAtLastError());
		gpuErrchk(cudaDeviceSynchronize());
    thrust::fill(candidate, candidate + n, false);



        int left = (int)thrust::count(col->coloring, col->coloring + n ,-1);
        if (left == 0){
            break;
        }
	}

  for (int i = 0; i < NSTREAM; ++i)
      gpuErrchk(cudaStreamDestroy(stream[i]));

    return col;
}

'File written in /content/src/LDFColorer.cu'

In [301]:
!nvcc -dc src/testerColorer.cu /content/src/LDFColorer.cu /content/drive/MyDrive/graphcoloring/graph/graph.cpp /content/drive/MyDrive/graphcoloring/graph/graph_d.cu
!nvcc testerColorer.o LDFColorer.o graph.o graph_d.o -o testerColorer
!./testerColorer

253779.406250 ms
Coloratura trovata: 3278 9244 2692 5381 5009 725 4826 3734 6548 9348 8930 5024 9530 6072 7182 7897 9057 5259 4396 7140 2675 6224 6124 8376 8400 2449 9582 6462 8336 9699 3787 7785 9314 5676 3716 3583 7662 6374 8725 7873 9626 6615 1127 6813 9549 648 5897 4004 8154 2655 1984 1215 9648 1195 6900 7736 4933 5660 5513 9767 1802 9382 2264 4097 5735 3203 9574 9437 2694 2208 108 5338 6417 7293 3020 2148 6307 5927 5633 9133 2263 3732 9551 8146 8468 8993 5807 3712 7316 8873 3392 7587 8951 6482 1909 7750 2214 3714 4872 6512 9855 1545 9510 8599 418 5786 3867 2003 7300 9395 6281 7956 7852 8849 6791 4168 5332 6013 963 2939 3159 4508 3987 8386 6684 9427 7452 3368 3737 3210 6889 5894 9804 4590 5762 788 6817 2181 3653 1579 5579 5553 5392 5809 975 4680 9328 2178 3290 515 6733 3085 3550 9763 1390 3640 4285 9668 2000 783 7105 5854 345 4095 7458 5137 7793 4692 1906 5954 9833 8928 6131 1364 1785 5185 7291 9965 7141 4656 9881 4286 3217 2151 8249 1800 2063 2883 257 1064 997 5091 4187 6776 7554 