# Schwefel Function Optimization

## 1. CUDA Parallel Implementation

In [37]:
%%writefile schwefel_cuda.cu
#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>

#include <curand_kernel.h>
using namespace std;

// ================= PARAMETERS =================
// Population size (one GPU thread per individual).
#define POP 256
// Number of decision variables per individual.
#define DIM 5
// Number of optimisation iterations.
#define MAX_IT 8000
// Search-space bounds; the negative literal is parenthesised so the macro
// expands safely inside any surrounding expression.
#define LB (-500.0)
#define UB (500.0)

// ================= DEVICE RANDOM =================
// Draws a double from the generator `st`, mapped linearly from the unit
// sample returned by curand_uniform_double onto the interval spanned by a and b.
__device__ double randF(curandState* st,double a,double b){
    const double unit = curand_uniform_double(st);
    return a + unit*(b-a);
}
// Draws an integer in the closed range [a, b] from the generator `st`.
// Expects a <= b.
// The unit sample is scaled by the number of admissible values and the result
// is clamped, so the returned value can never exceed b — neither when the
// sample equals 1.0 nor when the range is too wide for float precision.
__device__ int randInt(curandState* st,int a,int b){
    const int span = b - a + 1;                       // number of admissible values
    int offset = (int)(curand_uniform(st) * (double)span);
    if(offset >= span) offset = span - 1;             // sample == 1.0 maps onto b
    return a + offset;
}

// ================= SCHWEFEL FITNESS =================
// Schwefel objective for one individual: 418.9829*DIM - sum_k x_k*sin(sqrt(|x_k|)).
// `x` must point to DIM consecutive coordinates.
__device__ double schwefel(const double* x){
    double acc = 0.0;
    for(int k = 0; k < DIM; ++k){
        const double xk = x[k];
        acc += xk * sin(sqrt(fabs(xk)));
    }
    return 418.9829 * DIM - acc;
}

// ================= INIT POP =================
// Seeds one RNG state per individual and fills its DIM coordinates with
// random values drawn between LB and UB.
// Launch layout: 1-D grid, one thread per individual; any grid whose total
// thread count is >= POP works, surplus threads exit immediately.
// `pop` holds POP*DIM doubles (row-major per individual), `state` holds POP states.
__global__ void initPop(double* pop, curandState* state, unsigned long seed){
    // Global index instead of threadIdx.x alone, so multi-block launches
    // address distinct individuals.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < POP){
        // Same seed, distinct subsequence per individual.
        curand_init(seed,id,0,&state[id]);
        for(int d=0;d<DIM;d++)
            pop[id*DIM+d] = randF(&state[id],LB,UB);
    }
}

// ================= EVAL =================
// Evaluates the Schwefel objective of every individual.
// Launch layout: 1-D grid, one thread per individual; surplus threads exit.
// `pop` holds POP*DIM doubles (row-major per individual), `fit` receives POP values.
__global__ void evalFit(const double* pop,double* fit){
    // Global index so the kernel stays correct for multi-block launches.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id<POP)
        fit[id] = schwefel(pop + id*DIM);   // evaluate in place, no local copy needed
}

// ================= LOA UPDATE =================
// One optimisation step for the whole population, one thread per individual.
//
//   pop, fit : current positions (POP*DIM) and fitness values (POP); only read.
//   np,  nf  : receive the next positions and fitness values; each thread
//              writes only its own row, so no thread sees a half-updated neighbour.
//   st       : per-individual RNG states, advanced by this call.
//   t        : 1-based iteration number; must be non-zero (used as a divisor).
//
// Launch layout: 1-D grid with at least POP threads in total; surplus threads exit.
__global__ void LOA(double* pop,double* fit,double* np,double* nf,curandState* st,int t){
    // Global index so the kernel is also correct when launched with several blocks.
    int id=blockIdx.x*blockDim.x+threadIdx.x; if(id>=POP) return;

    // Work on a register copy of the RNG state and store it back at the end.
    curandState local=st[id];
    const double myFit=fit[id];
    int betterCnt=0,guideIdx=-1;

    // Count the individuals that are strictly better than this one.
    for(int j=0;j<POP;j++)
        if(fit[j]<myFit) betterCnt++;

    // Pick one of them uniformly at random: choose a rank, then walk to it.
    if(betterCnt>0){
        int pick=randInt(&local,0,betterCnt-1),c=0;
        for(int j=0;j<POP;j++){
            if(fit[j]<myFit){
                if(c==pick){ guideIdx=j; break; }
                c++;
            }
        }
    }

    double cur[DIM],cand[DIM];
    for(int d=0;d<DIM;d++) cur[d]=pop[id*DIM+d];

    double pr=curand_uniform_double(&local);

    if(pr<0.5 && guideIdx!=-1){
        // Move relative to the randomly chosen better individual.
        for(int d=0;d<DIM;d++){
            double r=curand_uniform_double(&local);
            int I=randInt(&local,1,2);
            double ssa=pop[guideIdx*DIM+d];
            cand[d]=cur[d]+r*(ssa-I*cur[d]);
            cand[d]=fmax(LB,fmin(UB,cand[d]));
        }
    }else{
        // Random perturbation whose amplitude shrinks as 1/t.
        for(int d=0;d<DIM;d++){
            double r=curand_uniform_double(&local);
            cand[d]=cur[d]+(1-2*r)*(UB-LB)/t;
            cand[d]=fmax(LB,fmin(UB,cand[d]));
        }
    }

    double fv = schwefel(cand);

    // Greedy selection: keep the candidate only if it strictly improves.
    if(fv<myFit){
        nf[id]=fv;
        for(int d=0;d<DIM;d++) np[id*DIM+d]=cand[d];
    }else{
        nf[id]=myFit;
        for(int d=0;d<DIM;d++) np[id*DIM+d]=cur[d];
    }

    st[id]=local;
}

// ================= MAIN =================
// Prints a readable message and terminates when a CUDA runtime call failed.
static void cudaCheck(cudaError_t err,const char* what){
    if(err!=cudaSuccess){
        cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(err)<<"\n";
        exit(EXIT_FAILURE);
    }
}

// Runs MAX_IT optimisation steps on the GPU, logs the best individual of every
// iteration to schwefel_cuda_log.csv and reports the best value and wall time.
int main(){

    auto start=chrono::high_resolution_clock::now();

    double *d_pop=nullptr,*d_fit=nullptr,*d_new=nullptr,*d_newfit=nullptr;
    curandState *d_state=nullptr;

    cudaCheck(cudaMalloc(&d_pop,POP*DIM*sizeof(double)),"cudaMalloc d_pop");
    cudaCheck(cudaMalloc(&d_fit,POP*sizeof(double)),"cudaMalloc d_fit");
    cudaCheck(cudaMalloc(&d_new,POP*DIM*sizeof(double)),"cudaMalloc d_new");
    cudaCheck(cudaMalloc(&d_newfit,POP*sizeof(double)),"cudaMalloc d_newfit");
    cudaCheck(cudaMalloc(&d_state,POP*sizeof(curandState)),"cudaMalloc d_state");

    // CSV logger
    ofstream csv("schwefel_cuda_log.csv");
    if(!csv) cerr<<"Warning: could not open schwefel_cuda_log.csv for writing\n";
    csv<<"iter,best";
    for(int i=1;i<=DIM;i++) csv<<",x"<<i;
    csv<<"\n";

    // Kernel launches report configuration errors through cudaGetLastError and
    // execution errors at the next synchronising call, so both are checked.
    initPop<<<1,POP>>>(d_pop,d_state,time(NULL));
    cudaCheck(cudaGetLastError(),"initPop launch");
    cudaCheck(cudaDeviceSynchronize(),"initPop");

    evalFit<<<1,POP>>>(d_pop,d_fit);
    cudaCheck(cudaGetLastError(),"evalFit launch");
    cudaCheck(cudaDeviceSynchronize(),"evalFit");

    double hfit[POP],hpop[POP*DIM];
    double best=DBL_MAX;

    for(int it=1;it<=MAX_IT;it++){

        LOA<<<1,POP>>>(d_pop,d_fit,d_new,d_newfit,d_state,it);
        cudaCheck(cudaGetLastError(),"LOA launch");
        cudaCheck(cudaDeviceSynchronize(),"LOA");

        // The step kernel wrote the next generation AND its fitness values,
        // so after the swap d_fit is already up to date; no re-evaluation needed.
        swap(d_pop,d_new);
        swap(d_fit,d_newfit);

        cudaCheck(cudaMemcpy(hfit,d_fit,POP*sizeof(double),cudaMemcpyDeviceToHost),"copy fitness");
        cudaCheck(cudaMemcpy(hpop,d_pop,POP*DIM*sizeof(double),cudaMemcpyDeviceToHost),"copy population");

        int bi=0;
        for(int i=1;i<POP;i++) if(hfit[i]<hfit[bi]) bi=i;
        best=min(best,hfit[bi]);

        // CSV log (every iteration)
        csv<<it<<","<<best;
        for(int d=0;d<DIM;d++) csv<<","<<hpop[bi*DIM+d];
        csv<<"\n";

        if(it%1000==0) cout<<"Iter "<<it<<" | Best = "<<best<<"\n";
    }

    csv.close();
    cout<<"\nCSV Saved -> schwefel_cuda_log.csv\n";

    cout<<"\nBest Found = "<<best<<"\n";
    auto end=chrono::high_resolution_clock::now();
    cout<<"CUDA Time = "<<chrono::duration<double>(end-start).count()<<" sec\n";

    // Release device buffers.
    cudaCheck(cudaFree(d_pop),"cudaFree d_pop");
    cudaCheck(cudaFree(d_fit),"cudaFree d_fit");
    cudaCheck(cudaFree(d_new),"cudaFree d_new");
    cudaCheck(cudaFree(d_newfit),"cudaFree d_newfit");
    cudaCheck(cudaFree(d_state),"cudaFree d_state");
    return 0;
}


Overwriting schwefel_cuda.cu


In [38]:
!nvcc -arch=sm_75 schwefel_cuda.cu -o schwefel_cuda
!./schwefel_cuda

Iter 1000 | Best = 0.0351766
Iter 2000 | Best = 0.000136063
Iter 3000 | Best = 6.38673e-05
Iter 4000 | Best = 6.36921e-05
Iter 5000 | Best = 6.36443e-05
Iter 6000 | Best = 6.36437e-05
Iter 7000 | Best = 6.36437e-05
Iter 8000 | Best = 6.36427e-05

CSV Saved -> schwefel_cuda_log.csv

Best Found = 6.36427e-05
CUDA Time = 1.47291 sec


## 2. Serial (Sequential) Implementation

In [33]:
%%writefile schwefel_serial.cpp
#include <bits/stdc++.h>
using namespace std;

/* ============== Random ============== */
// Process-wide generator, seeded from the wall clock at start-up.
mt19937 rng(time(NULL));

// Returns a double drawn from uniform_real_distribution(a, b) using the shared generator.
double randF(double a,double b){
    return uniform_real_distribution<double>{a,b}(rng);
}
// Returns an integer drawn uniformly from the closed range [a, b]
// using the shared generator.
int randInt(int a,int b){
    return uniform_int_distribution<int>{a,b}(rng);
}

/* ============== Fitness (identical to the OpenMP version) ============== */
// Schwefel objective: 418.9829 * n - sum_k x_k * sin(sqrt(|x_k|)),
// where n is the number of coordinates in x.
double schwefel(const vector<double> &x){
    double acc = 0.0;
    for(const double xk : x)
        acc += xk * sin(sqrt(abs(xk)));
    return 418.9829 * x.size() - acc;
}

/* ============== Params (unchanged) ============== */
int POP    = 256;    // population size
int DIM    = 5;      // coordinates per individual
int MAX_IT = 8000;   // number of iterations
double LB  = -500.0; // lower bound of every coordinate
double UB  = 500.0;  // upper bound of every coordinate

/* ============== Escape Step ============== */
// Builds a candidate by moving x relative to the guide position SSA:
// for each of the first DIM coordinates, x + r*(SSA - I*x) with r drawn via
// randF(0,1) and I drawn via randInt(1,2), then limited to [LB, UB].
vector<double> escape(const vector<double> &x,const vector<double> &SSA){
    vector<double> moved(x);
    for(int d=0;d<DIM;++d){
        const double step=randF(0,1);
        const int factor=randInt(1,2);
        double v=x[d] + step*(SSA[d] - factor*x[d]);
        if(v<LB) v=LB;   // same result as min(max(v,LB),UB)
        if(UB<v) v=UB;
        moved[d]=v;
    }
    return moved;
}

/* ============== Hide Step ============== */
// Builds a candidate by perturbing each of the first DIM coordinates of Xi by
// (1-2r)*(UB-LB)/t with r drawn via randF(0,1), then limiting it to [LB, UB].
// t is the iteration number and is used as a divisor.
vector<double> hide(const vector<double> &Xi,int t){
    vector<double> moved(Xi);
    for(int d=0;d<DIM;++d){
        const double r=randF(0,1);
        double v=Xi[d] + (1-2*r)*(UB-LB)/t;
        if(v<LB) v=LB;   // same result as min(max(v,LB),UB)
        if(UB<v) v=UB;
        moved[d]=v;
    }
    return moved;
}

/* ============== Init pop ============== */
// Creates POP individuals of DIM coordinates each, every coordinate drawn
// via randF(LB, UB) in row-major order.
vector<vector<double>> init_population(){
    vector<vector<double>> swarm;
    swarm.reserve(POP);
    for(int i=0;i<POP;++i){
        vector<double> individual(DIM);
        for(double &coord : individual)
            coord=randF(LB,UB);
        swarm.push_back(move(individual));
    }
    return swarm;
}

/* ============== MAIN (FULLY SEQUENTIAL) ============== */
// Sequential driver: initialises the population, runs MAX_IT iterations in
// which every individual proposes one candidate (escape or hide) and keeps it
// only when it is strictly better, then prints the best solution and the run time.
int main(){

    const auto tStart=chrono::high_resolution_clock::now();

    vector<vector<double>> pop = init_population();

    vector<double> fitness;
    fitness.reserve(POP);
    for(const auto &individual : pop)
        fitness.push_back(schwefel(individual));

    // Best individual of the initial population (first one wins on ties).
    double bestFit=DBL_MAX;
    vector<double> bestSol(DIM);
    for(int i=0;i<POP;++i){
        if(!(fitness[i]<bestFit)) continue;
        bestFit=fitness[i];
        bestSol=pop[i];
    }

    for(int iter=1; iter<=MAX_IT; ++iter){

        for(int i=0;i<POP;++i){

            const double ownFit=fitness[i];

            // Indices of all individuals strictly better than individual i.
            vector<int> fitter;
            for(int j=0;j<POP;++j)
                if(fitness[j]<ownFit) fitter.push_back(j);

            int guide=-1;
            if(!fitter.empty())
                guide = fitter[randInt(0,(int)fitter.size()-1)];

            // The coin flip is drawn only when a guide exists (short-circuit),
            // which keeps the random-number sequence unchanged.
            const bool follow = (guide!=-1) && (randF(0,1)<0.5);
            vector<double> trial = follow ? escape(pop[i],pop[guide])
                                          : hide(pop[i],iter);

            const double trialFit=schwefel(trial);

            // Greedy selection: discard anything that is not a strict improvement.
            if(!(trialFit<ownFit)) continue;

            if(trialFit<bestFit){
                bestFit=trialFit;
                bestSol=trial;
            }
            fitness[i]=trialFit;
            pop[i]=move(trial);
        }

        if(iter%1000==0)
            cout<<"Iter "<<iter<<" | Best = "<<bestFit<<"\n";
    }

    cout<<"\nFinal Best Solution:\n";
    for(int d=0;d<DIM;++d)
        cout<<"x"<<d+1<<" = "<<bestSol[d]<<"\n";

    cout<<"\nBest Schwefel Value = "<<bestFit<<"\n";

    const auto tEnd=chrono::high_resolution_clock::now();
    cout<<"\nExecution Time = "<<chrono::duration<double>(tEnd-tStart).count()<<" sec\n";
}


Overwriting schwefel_serial.cpp


In [34]:
!g++ schwefel_serial.cpp -o schwefel_serial
!./schwefel_serial

Iter 1000 | Best = 0.000199775
Iter 2000 | Best = 6.46465e-05
Iter 3000 | Best = 6.37647e-05
Iter 4000 | Best = 6.36704e-05
Iter 5000 | Best = 6.36694e-05
Iter 6000 | Best = 6.36578e-05
Iter 7000 | Best = 6.36578e-05
Iter 8000 | Best = 6.36578e-05

Final Best Solution:
x1 = 420.969
x2 = 420.968
x3 = 420.969
x4 = 420.969
x5 = 420.969

Best Schwefel Value = 6.36578e-05

Execution Time = 15.0554 sec


## 3. OpenMP Parallel Implementation

In [35]:

%%writefile schwefel_omp.cpp
#include <bits/stdc++.h>
#include <omp.h>
using namespace std;

/* ---------------------------------------
      Random helpers
      (thread-safe only when every thread
       passes its own generator)
---------------------------------------*/
// Returns a double drawn from uniform_real_distribution(a, b) using the caller's generator.
double randF(mt19937 &rng, double a, double b) {
    return uniform_real_distribution<double>{a, b}(rng);
}
// Returns an integer drawn uniformly from the closed range [a, b]
// using the caller's generator.
int randInt(mt19937 &rng, int a, int b) {
    return uniform_int_distribution<int>{a, b}(rng);
}

/* ---------------------------------------
            Fitness Function
---------------------------------------*/
// Schwefel objective: 418.9829 * n - sum_k x_k * sin(sqrt(|x_k|)),
// where n is the number of coordinates in x.
double schwefel(const vector<double> &x) {
    const int dim = x.size();

    double acc = 0.0;
    for (const double xk : x)
        acc += xk * sin(sqrt(abs(xk)));

    return 418.9829 * dim - acc;
}

/* ---------------------------------------
            LOA PARAMETERS
---------------------------------------*/
int POP    = 256;     // population size
int DIM    = 5;       // coordinates per individual
int MAX_IT = 8000;    // number of iterations
double LB  = -500.0;  // lower bound of every coordinate
double UB  = 500.0;   // upper bound of every coordinate

/* ---------------------------------------
     Escape (Global Search)  — parallel safe
---------------------------------------*/
// Builds a candidate by moving x relative to the guide position SSA:
// for each of the first DIM coordinates, x + r*(SSA - I*x) with r drawn via
// randF(rng,0,1) and I drawn via randInt(rng,1,2), then limited to [LB, UB].
// All randomness comes from the generator passed in by the caller.
vector<double> escape(const vector<double> &x,
                      const vector<double> &SSA,
                      mt19937 &rng)
{
    vector<double> moved(x);
    for(int d=0;d<DIM;++d){
        const double step = randF(rng,0,1);
        const int factor = randInt(rng,1,2);
        double v = x[d] + step * (SSA[d] - factor*x[d]);

        if(v<LB) v=LB;   // same result as min(max(v,LB),UB)
        if(UB<v) v=UB;
        moved[d] = v;
    }
    return moved;
}

/* ---------------------------------------
     Hide (Local Search) — parallel safe
---------------------------------------*/
// Builds a candidate by perturbing each of the first DIM coordinates of Xi by
// (1-2r)*(UB-LB)/t with r drawn via randF(rng,0,1), then limiting it to [LB, UB].
// t is the iteration number and is used as a divisor.
vector<double> hide(const vector<double> &Xi,int t,mt19937 &rng){
    vector<double> moved(Xi);

    for(int d=0;d<DIM;++d){
        const double r = randF(rng,0,1);
        double v = Xi[d] + (1 - 2*r)*(UB-LB)/t;
        if(v<LB) v=LB;   // same result as min(max(v,LB),UB)
        if(UB<v) v=UB;
        moved[d] = v;
    }
    return moved;
}

/* ---------------------------------------
         Initialize population (PARALLEL)
---------------------------------------*/
// Creates POP individuals of DIM coordinates each inside an OpenMP parallel
// region. Each thread draws coordinates via randF(gen, LB, UB) from
// rngs[omp_get_thread_num()], so rngs must hold one generator per thread of the team.
vector<vector<double>> init_population(vector<mt19937> &rngs){
    vector<vector<double>> swarm(POP, vector<double>(DIM));

    #pragma omp parallel
    {
        mt19937 &gen = rngs[omp_get_thread_num()];

        #pragma omp for schedule(static)
        for(int i=0;i<POP;i++){
            vector<double> &row = swarm[i];
            for(int d=0;d<DIM;d++)
                row[d] = randF(gen,LB,UB);
        }
    }
    return swarm;
}

/* =======================================
             MAIN LOA PARALLEL
=======================================*/
// OpenMP driver: initialises the population, runs MAX_IT iterations in which
// all individuals are updated in parallel, then prints the best solution and
// the run time.
//
// Each iteration reads the current generation (pop / fitness) and writes the
// next one into separate buffers (nextPop / nextFit). No thread therefore
// reads a position or fitness value that another thread is writing at the
// same time; the buffers are exchanged once the parallel region has finished.
int main(){
    auto t1 = chrono::high_resolution_clock::now();

    // One generator per potential thread, each with its own seed.
    int threads = omp_get_max_threads();
    vector<mt19937> rngs(threads);

    random_device rd;
    for(int i=0;i<threads;i++)
        rngs[i].seed(rd()+i*111);

    vector<vector<double>> pop = init_population(rngs);
    vector<double> fitness(POP);

    // Initial fitness
    for(int i=0;i<POP;i++) fitness[i]=schwefel(pop[i]);

    double bestFit = 1e18;
    vector<double> bestSol(DIM);

    // find initial best
    for(int i=0;i<POP;i++){
        if(fitness[i]<bestFit){
            bestFit=fitness[i];
            bestSol=pop[i];
        }
    }

    // Write targets for the next generation (same shape as pop / fitness).
    vector<vector<double>> nextPop = pop;
    vector<double> nextFit = fitness;

    /* --------------------------------------
                LOA ITERATIONS
       Full population parallel every step
    ---------------------------------------*/
    for(int it=1; it<=MAX_IT; it++){

        #pragma omp parallel
        {
            int tid = omp_get_thread_num();
            mt19937 &localRng = rngs[tid];

            double localBest = 1e18;
            vector<double> localBestSol(DIM);

            #pragma omp for schedule(static)
            for(int i=0;i<POP;i++){

                // pop and fitness are read-only inside the parallel region.
                vector<int> better;
                for(int j=0;j<POP;j++)
                    if(fitness[j]<fitness[i]) better.push_back(j);

                int betterIdx=-1;
                if(!better.empty())
                    betterIdx = better[randInt(localRng,0,(int)better.size()-1)];

                vector<double> candidate;
                if(randF(localRng,0,1)<0.5 && betterIdx!=-1)
                    candidate = escape(pop[i],pop[betterIdx],localRng);
                else
                    candidate = hide(pop[i],it,localRng);

                double f = schwefel(candidate);

                // Greedy selection into the next-generation buffers; each
                // thread writes only the rows of its own loop indices.
                if(f<fitness[i]){
                    nextPop[i]=candidate;
                    nextFit[i]=f;
                }else{
                    nextPop[i]=pop[i];
                    nextFit[i]=fitness[i];
                }
                if(f<localBest){
                    localBest=f;
                    localBestSol=candidate;
                }
            }

            // Update global best safely
            #pragma omp critical
            {
                if(localBest<bestFit){
                    bestFit=localBest;
                    bestSol=localBestSol;
                }
            }
        }

        // All threads have finished: publish the new generation.
        pop.swap(nextPop);
        fitness.swap(nextFit);

        if(it % 1000 == 0)
            cout<<"Iter "<<it<<" | Best = "<<bestFit<<"\n";
    }

    cout<<"\nFinal Best Solution:\n";
    for(int i=0;i<DIM;i++) cout<<"x"<<i+1<<" = "<<bestSol[i]<<endl;
    cout<<"\nBest " << "Schwefel" << " Value = "<<bestFit<<endl;

    auto t2 = chrono::high_resolution_clock::now();
    cout<<"\nExecution Time = "
        <<chrono::duration<double>(t2-t1).count()
        <<" sec\n";

    return 0;
}


Overwriting schwefel_omp.cpp


In [36]:
!g++ -fopenmp schwefel_omp.cpp -o schwefel_omp
!./schwefel_omp

Iter 1000 | Best = 0.0169311
Iter 2000 | Best = 0.000100145
Iter 3000 | Best = 6.3945e-05
Iter 4000 | Best = 6.36754e-05
Iter 5000 | Best = 6.36454e-05
Iter 6000 | Best = 6.36422e-05
Iter 7000 | Best = 6.36407e-05
Iter 8000 | Best = 6.36397e-05

Final Best Solution:
x1 = 420.969
x2 = 420.969
x3 = 420.969
x4 = 420.969
x5 = 420.969

Best Schwefel Value = 6.36397e-05

Execution Time = 12.9668 sec
