Skip to content
This repository has been archived by the owner on Aug 10, 2021. It is now read-only.

SIMD is not enabled? #2660

Closed
MyErpSoft opened this issue Feb 14, 2019 · 5 comments
Closed

SIMD is not enabled? #2660

MyErpSoft opened this issue Feb 14, 2019 · 5 comments

Comments

@MyErpSoft
Copy link

package sample

//import kotlin.random.Random
import kotlin.system.measureNanoTime

fun hello(): String = "Hello, Kotlin/Native!"

fun main() {
    runIt()
}

private fun runIt(){
    var sum = 0
    val time = measureNanoTime{
        //val ran = Random.Default
        for (i in 0 until 1_0000_0000){
            //val v = ran.nextInt()
            sum += getInt32TrueCount2(i)
        }
    }

    println("time:$time ns, result: $sum")
}

private fun getInt32TrueCount(value: Int):Int {
    if (value == 0) {
        return 0
    }

    return getByteTrueCount(value and 0xFF) +
            getByteTrueCount((value shr 8) and 0xFF) +
            getByteTrueCount((value shr 16) and 0xFF) +
            getByteTrueCount((value shr 24) and 0xFF)
}

private fun getByteTrueCount(value: Int) : Int{
    if(value== 0){
        return 0
    }

    val a = (value and 0x1)
    val b = ((value and 0x2)  shr 1)
    val c = ((value and 0x4)  shr 2)
    val d = ((value and 0x8)  shr 3)
    val e = ((value and 0x10) shr 4)
    val f = ((value and 0x20) shr 5)
    val g = ((value and 0x40) shr 6)
    val h = ((value and 0x80) shr 7)

    return a + b + c + d + e + f + g + h
}

private fun getInt32TrueCount2(value: Int):Int {
    if (value == 0) {
        return 0
    }

    return getByteTrueCount2(value and 0xFF) +
            getByteTrueCount2((value shr 8) and 0xFF) +
            getByteTrueCount2((value shr 16) and 0xFF) +
            getByteTrueCount2((value shr 24) and 0xFF)
}

private val m1 = intArrayOf(0x1,0x2,0x4,0x8,0x10,0x20,0x40,0x80)
private val m2 = intArrayOf(0,1,2,3,4,5,6,7)
private val m3 = IntArray(8)

private fun getByteTrueCount2(value : Int) : Int{
    if (value == 0) {
        return 0
    }

    var sum = 0
    for (i in 0 until m1.size){
        sum += (value and m1[i] shr i)
    }

    return sum
}

Kotlin Native : ~ 80 s
Kotlin JVM : ~ 1.3 s

Windows 10 64 位
Intel Core i5-6500 @3.2GHz 4 Core
16GB RAM
Kotlin native 1.3.21

Why is the getByteTrueCount2 method so slow?

@MyErpSoft
Copy link
Author

Bound checking elimination #1791

@SvyatoslavScherbina
Copy link
Collaborator

for (i in 0 until m1.size){
sum += (value and m1[i] shr i)

m1 is global variable, and accesses to global variables are noticeably more expensive than to local ones.

Extracting m1 to a local variable should help a bit.

Also please note: https://github.com/JetBrains/kotlin-native/blob/master/RELEASE_NOTES.md#performance

@olonho
Copy link
Contributor

olonho commented Feb 14, 2019

If goal is to write faster code, it makes sense to do it in the way suitable for the platform. In your case program with following modifications

package sample

import sample.interop1.*
import kotlin.system.measureNanoTime
import kotlin.random.Random

private fun runIt() {
    var sum = 0
    val time = measureNanoTime {
        //val ran = Random.Default
        for (i in 0 until 1_0000_0000){
            //val v = ran.nextInt()
            sum += getInt32TrueCount2(i)
        }
    }

    println("time:${time.toFloat() / 1_000_000_000} s, result: $sum")
}

private fun runItFaster() {
    var sum = 0
    val time = measureNanoTime {
        for (i in 0 until 1_0000_0000){
            sum += popcount(i)
        }
    }

    println("time:${time.toFloat() / 1_000_000_000} s, result: $sum")
}

private fun getInt32TrueCount(value: Int):Int {
    if (value == 0) {
        return 0
    }

    return getByteTrueCount(value and 0xFF) +
            getByteTrueCount((value shr 8) and 0xFF) +
            getByteTrueCount((value shr 16) and 0xFF) +
            getByteTrueCount((value shr 24) and 0xFF)
}

private fun getByteTrueCount(value: Int) : Int{
    if(value== 0){
        return 0
    }

    val a = (value and 0x1)
    val b = ((value and 0x2)  shr 1)
    val c = ((value and 0x4)  shr 2)
    val d = ((value and 0x8)  shr 3)
    val e = ((value and 0x10) shr 4)
    val f = ((value and 0x20) shr 5)
    val g = ((value and 0x40) shr 6)
    val h = ((value and 0x80) shr 7)

    return a + b + c + d + e + f + g + h
}

private fun getInt32TrueCount2(value: Int):Int {
    if (value == 0) {
        return 0
    }

    return getByteTrueCount2(value and 0xFF) +
            getByteTrueCount2((value shr 8) and 0xFF) +
            getByteTrueCount2((value shr 16) and 0xFF) +
            getByteTrueCount2((value shr 24) and 0xFF)
}

private val m1 = intArrayOf(0x1,0x2,0x4,0x8,0x10,0x20,0x40,0x80)
private val m2 = intArrayOf(0,1,2,3,4,5,6,7)
private val m3 = IntArray(8)

private fun getByteTrueCount2(value : Int) : Int{
    if (value == 0) {
        return 0
    }

    var sum = 0
    for (i in 0 until m1.size){
        sum += (value and m1[i] shr i)
    }

    return sum
}

fun main() {
    runIt()
    runItFaster()
}

And with following .def file

package=sample.interop1
---
#include <stdint.h>
int32_t popcount(int32_t x) {
   return __builtin_popcount(x);
}

prints on my machine

time:16.690422 s, result: 1314447104
time:0.3516291 s, result: 1314447104

@olonho
Copy link
Contributor

olonho commented Feb 14, 2019

Same code on JVM produces

time:1.0619177 s, result: 1314447104

which makes Native about 3 times faster on that task.

@MyErpSoft
Copy link
Author

Cool!
I learned something new. Thank you.

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants