#Lab1-Part1: Sequential Code

In [None]:
%%writefile sequential_kernel.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Reads one six-character record from `in`.
// Characters 0, 2 and 4 are converted from ASCII digits into *a, *b and
// *gateType; characters 1, 3 and 5 are read and discarded (presumably the
// separators and the line terminator -- confirm against the input format).
// Returns 1 on success, 0 when the file ended before the gate type was read.
static int readGateRecord(FILE* in, int* a, int* b, int* gateType){
    int ch[6];
    for(int j = 0; j < 6; j++){
        ch[j] = fgetc(in);
    }
    //A missing final line terminator (ch[5] == EOF) is tolerated on purpose
    if(ch[0] == EOF || ch[2] == EOF || ch[4] == EOF){
        return 0;
    }
    *a = ch[0] - '0';
    *b = ch[2] - '0';
    *gateType = ch[4] - '0';
    return 1;
}

// Sequential logic-gate evaluator.
// Usage: ./sequential <input_file_path> <input_file_length> <output_file_path>
// Reads <input_file_length> records, looks each one up in the truth table and
// writes one result per line to the output file. A record whose A/B values are
// not 0/1 or whose gate type is outside 0..5 is written as -1 so the output
// stays line-aligned with the input.
// Returns 0 on success and -1 on bad arguments, unopenable files or short input.
int main(int argc, char *argv[]){
    //Check that our input is correct
    if(argc != 4){
        printf("ERROR: The input should be of the following format.\n");
        printf("./sequential <input_file_path> <input_file_length> <output_file_path> \n");
        return -1;
    }

    //atoi yields 0 for non-numeric text, so reject it before touching any file
    int inputLength = atoi(argv[2]);
    if(inputLength <= 0){
        printf("ERROR: <input_file_length> must be a positive integer.\n");
        return -1;
    }

    //Try and open the file to make sure it exists
    FILE* fp1 = fopen(argv[1], "r");
    if(fp1 == NULL){
        printf("ERROR: File could not be found.");
        return -1;
    }

    //Seeing as the file exists we create a new file for the output
    FILE* fp2 = fopen(argv[3], "w+");
    if(fp2 == NULL){
        printf("ERROR: New file could not be created.");
        //Writing through a NULL stream would crash, so stop here
        fclose(fp1);
        return -1;
    }

    //Truth table of the supported gates.
    //Row index is A*2+B: row 0 is A=B=0, row 1 is A=0 B=1, row 2 is A=1 B=0, row 3 is A=B=1
    //Columns are the gate types: 0=AND, 1=OR, 2=NAND, 3=NOR, 4=XOR, 5=XNOR
    const int output[4][6] = {{0,0,1,1,0,1},
                              {0,1,1,0,1,0},
                              {0,1,1,0,1,0},
                              {1,1,0,0,0,1}};

    //Lets record how long this takes
    clock_t begin = clock();

    for(int i = 0; i < inputLength; i++){
        int a = 0;
        int b = 0;
        int gateType = 0;

        //Grab A, B, and gateType for every line
        if(!readGateRecord(fp1, &a, &b, &gateType)){
            printf("ERROR: Input file ended after %d of %d lines.\n", i, inputLength);
            fclose(fp2);
            fclose(fp1);
            return -1;
        }

        //Only index the table with values known to be in range
        int result = -1;
        if((a == 0 || a == 1) && (b == 0 || b == 1) && gateType >= 0 && gateType < 6){
            result = output[a * 2 + b][gateType];
        }
        fprintf(fp2, "%d\n", result);
    }

    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Total time for logic gate reading was: %f seconds", time_spent);

    fclose(fp2);
    fclose(fp1);
    return 0;
}

Overwriting sequential_kernel.cu


In [None]:
!nvcc sequential_kernel.cu -o sequential
!./sequential input_1000000.txt 1000000 output_1000000.txt

Total time for logic gate reading was: 0.124884 seconds

#Lab1-Part2: Explicit Memory Allocation

In [None]:
%%writefile explicit_kernel.cu


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>

// Evaluates one two-input logic gate.
//   a, b     : gate inputs, each expected to be 0 or 1
//   gateType : 0=AND, 1=OR, 2=NAND, 3=NOR, 4=XOR, 5=XNOR
// Returns the gate output (0 or 1), or -1 when any argument is out of range,
// so the truth table is never indexed out of bounds.
static __host__ __device__ inline int evaluateGate(int a, int b, int gateType) {
    // Row index is a*2+b: row 0 is A=B=0, row 1 is A=0 B=1, row 2 is A=1 B=0, row 3 is A=B=1.
    // Columns follow the gate type order listed above.
    const int table[4][6] = { {0,0,1,1,0,1},
                              {0,1,1,0,1,0},
                              {0,1,1,0,1,0},
                              {1,1,0,0,0,1} };

    const bool inputsValid = (a == 0 || a == 1) && (b == 0 || b == 1);
    const bool gateValid = gateType >= 0 && gateType < 6;
    if (!inputsValid || !gateValid) {
        return -1;
    }
    return table[a * 2 + b][gateType];
}

// Kernel: computes the gate output for every record of a flat input array.
//   arr_old         : `length` records of COLS ints each; columns 0, 1 and 2 of
//                     a record hold A, B and the gate type (COLS must be >= 3)
//   arr_new         : `length` ints; element i receives the result of record i,
//                     or -1 when record i holds invalid values
//   length          : number of records
//   COLS            : number of ints per record in arr_old
//   threadsPerBlock : unused, kept so existing launches keep compiling
// Launch layout: 1-D grid of 1-D blocks. Each thread strides over the records
// by the total number of launched threads, so any grid/block size produces a
// complete result.
__global__ void gateAnalysis(int* arr_old, int* arr_new, int length, int COLS, int threadsPerBlock) {
    (void)threadsPerBlock;

    // size_t arithmetic keeps i * COLS from overflowing int on large inputs
    const size_t total = length > 0 ? (size_t)length : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
        // array is flat: record i starts at i * COLS
        const size_t base = i * (size_t)COLS;
        arr_new[i] = evaluateGate(arr_old[base + 0], arr_old[base + 1], arr_old[base + 2]);
    }
}
// Stops the program with a readable message when a CUDA runtime call fails.
// Kernel launches return nothing, so follow them with
// CUDA_CHECK(cudaGetLastError()) to surface launch-configuration errors.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t cudaStatus_ = (call);                                      \
        if (cudaStatus_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cudaStatus_));                          \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

// Reads one six-character record from `in`.
// Characters 0, 2 and 4 are converted from ASCII digits into *a, *b and
// *gateType; characters 1, 3 and 5 are read and discarded (presumably the
// separators and the line terminator -- confirm against the input format).
// Returns 1 on success, 0 when the file ended before the gate type was read.
static int readGateRecord(FILE* in, int* a, int* b, int* gateType) {
    int ch[6];
    for (int j = 0; j < 6; j++) {
        ch[j] = fgetc(in);
    }
    // A missing final line terminator (ch[5] == EOF) is tolerated on purpose
    if (ch[0] == EOF || ch[2] == EOF || ch[4] == EOF) {
        return 0;
    }
    *a = ch[0] - '0';
    *b = ch[2] - '0';
    *gateType = ch[4] - '0';
    return 1;
}

// Logic-gate evaluator using explicit host/device allocations.
// Usage: <program> <input_file_path> <input_file_length> <output_file_path>
// Reads the records on the host, copies them to the device, runs gateAnalysis
// over them, copies the results back and writes one result per line.
// Returns 0 on success and -1 on bad arguments, unopenable files, failed host
// allocation or short input; a failing CUDA call terminates the process.
int main(int argc, char* argv[]) {
    //Check that our input is correct
    if (argc != 4) {
        printf("ERROR: The input should be of the following format.\n");
        printf("%s <input_file_path> <input_file_length> <output_file_path> \n",
               argc > 0 ? argv[0] : "./parallel_explicit");
        return -1;
    }

    //atoi yields 0 for non-numeric text, so reject it before touching any file
    const int inputLength = atoi(argv[2]);
    if (inputLength <= 0) {
        printf("ERROR: <input_file_length> must be a positive integer.\n");
        return -1;
    }

    //Try and open the file to make sure it exists
    FILE* fp1 = fopen(argv[1], "r");
    if (fp1 == NULL) {
        printf("ERROR: File could not be found.");
        return -1;
    }

    //Seeing as the file exists we create a new file for the output
    FILE* fp2 = fopen(argv[3], "w+");
    if (fp2 == NULL) {
        printf("ERROR: New file could not be created.");
        //Writing through a NULL stream would crash, so stop here
        fclose(fp1);
        return -1;
    }

    const int COLS = 3;
    //Block size is fixed; the grid is sized by ceil-division so every record
    //gets a thread whatever the input length is
    const int THREADS = 256;
    const unsigned int blocks = (unsigned int)(((size_t)inputLength + THREADS - 1) / THREADS);

    //Byte counts are kept in size_t so large inputs do not overflow int
    const size_t inputSize = sizeof(int) * (size_t)inputLength * COLS;
    const size_t outputSize = sizeof(int) * (size_t)inputLength;

    //Need to pass the GPU the data so we will make an array to hold our data
    int* inputMatrix = (int*)malloc(inputSize);
    int* outputMatrix = (int*)malloc(outputSize);
    if (inputMatrix == NULL || outputMatrix == NULL) {
        printf("ERROR: Could not allocate host memory.\n");
        free(inputMatrix);
        free(outputMatrix);
        fclose(fp2);
        fclose(fp1);
        return -1;
    }

    for (int i = 0; i < inputLength; i++) {
        int a = 0;
        int b = 0;
        int gateType = 0;
        if (!readGateRecord(fp1, &a, &b, &gateType)) {
            printf("ERROR: Input file ended after %d of %d lines.\n", i, inputLength);
            free(inputMatrix);
            free(outputMatrix);
            fclose(fp2);
            fclose(fp1);
            return -1;
        }
        // array is flat now
        inputMatrix[i * (size_t)COLS + 0] = a;
        inputMatrix[i * (size_t)COLS + 1] = b;
        inputMatrix[i * (size_t)COLS + 2] = gateType;
    }

    int* d_inputMatrix = NULL;
    int* d_outputMatrix = NULL;

    //Allocate space for the input matrix and output matrix
    CUDA_CHECK(cudaMalloc((void**)&d_inputMatrix, inputSize));
    CUDA_CHECK(cudaMalloc((void**)&d_outputMatrix, outputSize));
    //Fill the results with 0xFF bytes so a slot the kernel does not write
    //reads back as -1 instead of uninitialised memory
    CUDA_CHECK(cudaMemset(d_outputMatrix, 0xFF, outputSize));

    //Transfer input matrix to gpu.
    clock_t begin_data = clock();
    CUDA_CHECK(cudaMemcpy(d_inputMatrix, inputMatrix, inputSize, cudaMemcpyHostToDevice));
    clock_t end_data = clock();
    double time_spent_data = (double)(end_data - begin_data) / CLOCKS_PER_SEC;

    printf("Total time for data transfer was: %f seconds\n", time_spent_data);

    clock_t begin = clock();

    gateAnalysis<<<blocks, THREADS>>>(d_inputMatrix, d_outputMatrix, inputLength, COLS, THREADS);
    CUDA_CHECK(cudaGetLastError());        //launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   //errors raised while the kernel ran

    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    CUDA_CHECK(cudaMemcpy(outputMatrix, d_outputMatrix, outputSize, cudaMemcpyDeviceToHost));

    //print solution array to file
    for (int row = 0; row < inputLength; row++) {
        // array is flat now
        fprintf(fp2, "%d\n", outputMatrix[row]);
    }

    printf("Total time for logic gate was: %f seconds\n", time_spent);

    //free memory and close file pointers
    free(inputMatrix);
    free(outputMatrix);
    CUDA_CHECK(cudaFree(d_inputMatrix));
    CUDA_CHECK(cudaFree(d_outputMatrix));
    fclose(fp2);
    fclose(fp1);
    return 0;
}


Overwriting explicit_kernel.cu


In [None]:
!nvcc explicit_kernel.cu -o parallel_explicit
!./parallel_explicit input_1000000.txt 1000000 output_1000000.txt

Total time for data transfer was: 0.002699 seconds
Total time for logic gate was: 0.000346 seconds


#Lab1-Part3: Unified Memory Allocation

In [None]:
%%writefile unified_kernel.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>

// Evaluates one two-input logic gate.
//   a, b     : gate inputs, each expected to be 0 or 1
//   gateType : 0=AND, 1=OR, 2=NAND, 3=NOR, 4=XOR, 5=XNOR
// Returns the gate output (0 or 1), or -1 when any argument is out of range,
// so the truth table is never indexed out of bounds.
static __host__ __device__ inline int evaluateGate(int a, int b, int gateType) {
    // Row index is a*2+b: row 0 is A=B=0, row 1 is A=0 B=1, row 2 is A=1 B=0, row 3 is A=B=1.
    // Columns follow the gate type order listed above.
    const int table[4][6] = { {0,0,1,1,0,1},
                              {0,1,1,0,1,0},
                              {0,1,1,0,1,0},
                              {1,1,0,0,0,1} };

    const bool inputsValid = (a == 0 || a == 1) && (b == 0 || b == 1);
    const bool gateValid = gateType >= 0 && gateType < 6;
    if (!inputsValid || !gateValid) {
        return -1;
    }
    return table[a * 2 + b][gateType];
}

// Kernel: computes the gate output for every record of a flat input array.
//   arr_old         : `length` records of COLS ints each; columns 0, 1 and 2 of
//                     a record hold A, B and the gate type (COLS must be >= 3)
//   arr_new         : `length` ints; element i receives the result of record i,
//                     or -1 when record i holds invalid values
//   length          : number of records
//   COLS            : number of ints per record in arr_old
//   threadsPerBlock : unused, kept so existing launches keep compiling
// Launch layout: 1-D grid of 1-D blocks. Each thread strides over the records
// by the total number of launched threads, so any grid/block size produces a
// complete result.
__global__ void gateAnalysis(int* arr_old, int* arr_new, int length, int COLS, int threadsPerBlock) {
    (void)threadsPerBlock;

    // size_t arithmetic keeps i * COLS from overflowing int on large inputs
    const size_t total = length > 0 ? (size_t)length : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
        // array is flat: record i starts at i * COLS
        const size_t base = i * (size_t)COLS;
        arr_new[i] = evaluateGate(arr_old[base + 0], arr_old[base + 1], arr_old[base + 2]);
    }
}
// Stops the program with a readable message when a CUDA runtime call fails.
// Kernel launches return nothing, so follow them with
// CUDA_CHECK(cudaGetLastError()) to surface launch-configuration errors.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t cudaStatus_ = (call);                                      \
        if (cudaStatus_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cudaStatus_));                          \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

// Reads one six-character record from `in`.
// Characters 0, 2 and 4 are converted from ASCII digits into *a, *b and
// *gateType; characters 1, 3 and 5 are read and discarded (presumably the
// separators and the line terminator -- confirm against the input format).
// Returns 1 on success, 0 when the file ended before the gate type was read.
static int readGateRecord(FILE* in, int* a, int* b, int* gateType) {
    int ch[6];
    for (int j = 0; j < 6; j++) {
        ch[j] = fgetc(in);
    }
    // A missing final line terminator (ch[5] == EOF) is tolerated on purpose
    if (ch[0] == EOF || ch[2] == EOF || ch[4] == EOF) {
        return 0;
    }
    *a = ch[0] - '0';
    *b = ch[2] - '0';
    *gateType = ch[4] - '0';
    return 1;
}

// Logic-gate evaluator using managed (unified) memory.
// Usage: <program> <input_file_path> <input_file_length> <output_file_path>
// Reads the records straight into a managed buffer, runs gateAnalysis over
// them and writes one result per line from the managed output buffer.
// Returns 0 on success and -1 on bad arguments, unopenable files or short
// input; a failing CUDA call terminates the process.
int main(int argc, char* argv[]) {
    //Check that our input is correct
    if (argc != 4) {
        printf("ERROR: The input should be of the following format.\n");
        printf("%s <input_file_path> <input_file_length> <output_file_path> \n",
               argc > 0 ? argv[0] : "./parallel_unified");
        return -1;
    }

    //atoi yields 0 for non-numeric text, so reject it before touching any file
    const int inputLength = atoi(argv[2]);
    if (inputLength <= 0) {
        printf("ERROR: <input_file_length> must be a positive integer.\n");
        return -1;
    }

    //Try and open the file to make sure it exists
    FILE* fp1 = fopen(argv[1], "r");
    if (fp1 == NULL) {
        printf("ERROR: File could not be found.");
        return -1;
    }

    //Seeing as the file exists we create a new file for the output
    FILE* fp2 = fopen(argv[3], "w+");
    if (fp2 == NULL) {
        printf("ERROR: New file could not be created.");
        //Writing through a NULL stream would crash, so stop here
        fclose(fp1);
        return -1;
    }

    const int COLS = 3;
    //Block size is fixed; the grid is sized by ceil-division so every record
    //gets a thread whatever the input length is
    const int THREADS = 256;
    const unsigned int blocks = (unsigned int)(((size_t)inputLength + THREADS - 1) / THREADS);

    //Byte counts are kept in size_t so large inputs do not overflow int
    const size_t inputSize = sizeof(int) * (size_t)inputLength * COLS;
    const size_t outputSize = sizeof(int) * (size_t)inputLength;

    int* inputMatrix = NULL;
    int* outputMatrix = NULL;

    //Allocate space for the input matrix and output matrix
    CUDA_CHECK(cudaMallocManaged((void**)&inputMatrix, inputSize));
    CUDA_CHECK(cudaMallocManaged((void**)&outputMatrix, outputSize));
    //Fill the results with 0xFF bytes so a slot the kernel does not write
    //reads back as -1 instead of uninitialised memory
    CUDA_CHECK(cudaMemset(outputMatrix, 0xFF, outputSize));
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < inputLength; i++) {
        int a = 0;
        int b = 0;
        int gateType = 0;
        if (!readGateRecord(fp1, &a, &b, &gateType)) {
            printf("ERROR: Input file ended after %d of %d lines.\n", i, inputLength);
            cudaFree(inputMatrix);
            cudaFree(outputMatrix);
            fclose(fp2);
            fclose(fp1);
            return -1;
        }
        // array is flat now
        inputMatrix[i * (size_t)COLS + 0] = a;
        inputMatrix[i * (size_t)COLS + 1] = b;
        inputMatrix[i * (size_t)COLS + 2] = gateType;
    }

    //Now run parallelization and time it
    clock_t begin = clock();

    gateAnalysis<<<blocks, THREADS>>>(inputMatrix, outputMatrix, inputLength, COLS, THREADS);
    CUDA_CHECK(cudaGetLastError());        //launch-configuration errors
    //The host must not read outputMatrix until the kernel has finished
    CUDA_CHECK(cudaDeviceSynchronize());

    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    //print solution array to file
    for (int row = 0; row < inputLength; row++) {
        // array is flat now
        fprintf(fp2, "%d\n", outputMatrix[row]);
    }

    printf("Total time was: %f seconds\n", time_spent);

    //free memory and close file pointers
    CUDA_CHECK(cudaFree(inputMatrix));
    CUDA_CHECK(cudaFree(outputMatrix));
    fclose(fp2);
    fclose(fp1);
    return 0;
}


Overwriting unified_kernel.cu


In [None]:
!nvcc unified_kernel.cu -o parallel_unified
!./parallel_unified input_1000000.txt 1000000 output_1000000.txt

Total time was: 0.003800 seconds


#Comparison

In [None]:
!nvcc comparison.cu -o comparison
!./comparison output_100000.txt sol_100000.txt

Total Errors : 0	

In [None]:
%%writefile comparison.cu
#include <stdio.h>
#include <stdlib.h>


// Compares two text files character by character and prints every mismatch.
//   file_name1, file_name2 : paths of the files to compare
// For each differing character it prints the line number and the position
// within that line, then prints the total number of mismatches. When one file
// ends before the other, the length difference is reported and counted as one
// additional error. If either file cannot be opened an error is written to
// stderr and nothing is compared.
// Based on https://www.tutorialspoint.com/c-program-to-compare-two-files-and-report-mismatches
void compareFiles(char *file_name1, char *file_name2)
{
    FILE* fp1 = fopen(file_name1, "r");
    if (fp1 == NULL)
    {
        fprintf(stderr, "ERROR: Could not open %s\n", file_name1);
        return;
    }

    FILE* fp2 = fopen(file_name2, "r");
    if (fp2 == NULL)
    {
        fprintf(stderr, "ERROR: Could not open %s\n", file_name2);
        fclose(fp1);
        return;
    }

    // getc returns int; storing it in char makes the EOF test unreliable
    int ch1 = getc(fp1);
    int ch2 = getc(fp2);

    // error keeps track of number of errors
    // pos keeps track of position of errors
    // line keeps track of error line
    int error = 0, pos = 0, line = 1;

    // iterate until either file is exhausted
    while (ch1 != EOF && ch2 != EOF)
    {
        pos++;

        // when both files reach a newline together, move to the
        // next line and restart the position counter
        if (ch1 == '\n' && ch2 == '\n')
        {
            line++;
            pos = 0;
        }

        // any differing character counts as one error
        if (ch1 != ch2)
        {
            error++;
            printf("Line Number : %d \tError"
               " Position : %d \n", line, pos);
        }

        ch1 = getc(fp1);
        ch2 = getc(fp2);
    }

    // exactly one file hit EOF: the other one has extra content
    if (ch1 != ch2)
    {
        error++;
        printf("Files differ in length starting at line %d\n", line);
    }

    printf("Total Errors : %d\t", error);

    fclose(fp1);
    fclose(fp2);
}

// Entry point of the comparison tool.
// Expects the two file paths to compare as the first two command-line
// arguments and exits with status 1 when fewer are supplied.
int main(int argc, char *argv[]){
    const int haveBothPaths = (argc >= 3);

    if (!haveBothPaths) {
        printf("Require two files\n");
        exit(1);
    }

    char *firstPath = argv[1];
    char *secondPath = argv[2];
    compareFiles(firstPath, secondPath);
    return 0;
}

Writing comparison.cu
