Double.spin2

{{
        Double -- double (and single) precision floating point routines for
	the Propeller

        Copyright (c) 2012-2019 Total Spectrum Software Inc.

        Released under the MIT License (see the end of this file for details)      
}}
'#define DEBUG

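''
'' define EXACT if results should be exactly correct; otherwise they may be
'' wrong in the last digit (up to 0.5 ulp error).
'' In the code below EXACT only selects the final error-correction step of
'' the square root (see sqrt_heron).
''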
#define EXACT

CON
  '' status bits kept in the Aflag / Bflag / Cflag words
  FLAG_SIGN      = $01		' value is negative; the packers rely on this being bit 0
  FLAG_ZERO      = $02		' value is zero
  FLAG_INF       = $04		' value is infinity
  FLAG_NAN       = $08		' value is not a number
  FLAG_STICKY    = $10		' nonzero bits were shifted out (used for rounding)
  FLAG_DOUBLE    = $20		' set by DUnpack, tested by _Sqrt

  '' single precision exponent bias and all-ones exponent field
  FBIAS_EXP      = $7f
  FMAX_EXP       = $ff

  '' double precision exponent bias and all-ones exponent field
  DBIAS_EXP      = $3ff
  DMAX_EXP       = 2047

  '' fraction bits of the high long of a double
  DMANTMASK      = $000F_FFFF
  '' fraction bits of a float
  FMANTMASK      = $007F_FFFF

  '' the value 1.0 in 4.28 fixed point (position of the implied one)
  one_4_28       = 1 << 28
  '' single precision NaN returned by FPack
  NAN_32         = $7FC0_0000
  
DAT

'----------------------------
' Assembly language routines
'----------------------------

'----------------------------
' Register aliases and working storage
'----------------------------

'' Argument / result registers.  r0..r3 are aliases for x10..x13, which
'' are defined outside this section.  In the code below (r1,r0) carries
'' the first double operand and the double result, (r3,r2) the second
'' double operand; single precision values use r0 and r1.
#define r0	x10
#define	r1	x11
#define	r2	x12
#define	r3	x13

		'' working storage, reserved starting at cog address $1d0
		org     $1d0

'' unpacked operand A: significand high long, low long, flags, exponent
A		res	1
Alo		res	1
Aflag		res	1
Aexp		res	1

'' unpacked operand B (same layout as A)
B		res	1
Blo		res	1
Bflag		res	1
Bexp		res	1

'' third value, used as a multiplier copy (_Mul) and to keep the
'' original argument during the square root
C		res	1
Clo		res	1
Cflag		res	1
Cexp		res	1

'' two more low longs so that A,Alo,Alo2,Alo3 can hold a 128 bit product
Alo2		res	1
Alo3		res	1

'' scratch values; count is the iteration counter of the _Div loop
tmp0		res	1
tmp1		res	1
tmp2		res	1
count		res	1



		'' the reservations above must end at or before $1f0
		fit	$1f0

		'' everything after this point is assembled for hub memory
		orgh

		''
		'' DUnpack: take apart the IEEE double held in A (high long)
		'' and Alo (low long).
		'' IEEE layout: 1 sign bit, 11 exponent bits, 52 fraction bits.
		''
		''   out: A, Alo = significand, 4.28 in A followed by 0.32 in Alo
		''        Aexp   = exponent with the bias removed
		''        Aflag  = FLAG_DOUBLE plus SIGN / ZERO / INF / NAN
		''   uses tmp0 and pa as scratch, changes C and Z
		''
DUnpack
		mov	Aexp, A
		mov	Aflag, #FLAG_DOUBLE
		and	A, ##DMANTMASK		' keep only the fraction bits of the high long
		shl	Aexp, #1 wc		' sign bit falls into C
	if_c	or	Aflag, #FLAG_SIGN
		shr	Aexp, #21 wz		' Aexp = raw 11 bit exponent field
	if_z	jmp	#_Ddenorm		' field is 0: zero or denormal
		cmp	Aexp, ##DMAX_EXP wz
	if_z	jmp	#_Dnan			' field is all ones: NaN or Infinity
		sub	Aexp, ##DBIAS_EXP	' ordinary number: remove bias

		'' the fraction is 1.20 in A; move the pair up by 8 bits
		'' to 4.28 so there is head room above the leading one
		shl	A, #8
		mov	tmp0, Alo
		shr	tmp0, #24		' top 8 bits of Alo move into A
		shl	Alo, #8
		or	A, ##one_4_28		' put back the implied one
		or	A, tmp0
DUnpack_ret
		ret

		'' exponent field was 0 (so Aexp is 0 here):
		'' the value is zero or a denormal
_Ddenorm
		'' the significand is still 8 bits below its 4.28 position
		'' and has no implied one; the 1+8 lets Normalize finish
		'' with the right exponent
		add	Aexp, #(1+8)
		sub	Aexp, ##DBIAS_EXP
		mov	pa, Alo
		or	pa, A wz		' any fraction bit set?
	if_nz	jmp	#Normalize		' yes: a denormal, renormalize it
		'' all bits clear: a true zero, give it a very small exponent
		or	Aflag, #FLAG_ZERO
		sub	Aexp, #64
		ret

		'' exponent field was all ones: NaN or Infinity
_Dnan
		mov	pa, Alo
		or	pa, A wz		' fraction all zero means Infinity
		mov	Aexp, ##DMAX_EXP
	if_z	mov	A, ##one_4_28
	if_z	or	Aflag, #FLAG_INF
	if_nz	shl	Aexp, #1		' NaN gets twice the exponent of Infinity
	if_nz	or	Aflag, #FLAG_NAN
		ret

		''
		'' FUnpack: take apart the IEEE single precision value in A.
		'' IEEE layout: 1 sign bit, 8 exponent bits, 23 fraction bits.
		''
		''   out: A     = significand in 4.28, Alo = 0
		''        Aexp  = exponent with the bias removed
		''        Aflag = SIGN / ZERO / INF / NAN (FLAG_DOUBLE is clear)
		''   uses pa as scratch, changes C and Z
		''
FUnpack
		mov	Aexp, A
		and	A, ##FMANTMASK		' keep only the fraction bits
		mov	Aflag, #0
		mov	Alo, #0
		shl	Aexp, #1 wc		' sign bit falls into C
	if_c	or	Aflag, #FLAG_SIGN
		shr	Aexp, #24 wz		' Aexp = raw 8 bit exponent field
	if_z	jmp	#_Fdenorm		' field is 0: zero or denormal
		cmp	Aexp, #FMAX_EXP wz
	if_z	jmp	#_Fnan			' field is all ones: NaN or Infinity
		sub	Aexp, #FBIAS_EXP	' ordinary number: remove bias

		'' the fraction is 1.23; move it up to 4.28
		shl	A, #5
		or	A, ##one_4_28		' put back the implied one
		ret

		'' exponent field was all ones: NaN or Infinity
_Fnan
		mov	pa, Alo
		or	pa, A wz		' fraction all zero means Infinity
		mov	Aexp, #FMAX_EXP
	if_z	mov	A, ##one_4_28
	if_z	or	Aflag, #FLAG_INF
	if_nz	shl	Aexp, #1		' NaN gets twice the exponent of Infinity
	if_nz	or	Aflag, #FLAG_NAN
		ret

		'' exponent field was 0 (so Aexp is 0 here):
		'' the value is zero or a denormal
_Fdenorm
		shl	A, #5 wz		' 1.23 -> 1.28, and test for all zero
		sub	Aexp, #(FBIAS_EXP-1)
	if_nz	jmp	#Normalize		' a denormal: renormalize it
		'' a true zero, give it a very small exponent
		or	Aflag, #FLAG_ZERO
		sub	Aexp, #511
		ret


		''
		'' Normalize: bring the significand in A, Alo back to 4.28
		'' form, i.e. with its leading one in the one_4_28 position,
		'' adjusting Aexp to match.
		'' An all-zero significand just sets FLAG_ZERO.
		'' Bits lost when shifting right are recorded in FLAG_STICKY.
		'' Uses pa as scratch, changes C and Z.
		''
Normalize
		mov	pa, Alo
		or	pa, A wz
	if_z	or	Aflag, #FLAG_ZERO
	if_z	ret				' nothing to normalize

		'' too big: shift right while any of the top 3 bits is set
_down
		test	A, ##$E000_0000 wz
	if_z	jmp	#_up
		shr	A, #1 wc
		rcr	Alo, #1 wc		' C = bit that dropped off the end
	if_c	or	Aflag, #FLAG_STICKY	' note that a one was lost
		add	Aexp, #1
		jmp	#_down

		'' too small: shift left until the leading one is in place
_up
		test   A, ##one_4_28 wz
	if_nz	ret
		sub    Aexp, #1
		shl    Alo, #1 wc
		rcl    A, #1
		jmp    #_up
Normalize_ret
		ret

		''
		'' DPack: convert the unpacked value A, Alo (4.60 significand),
		'' Aexp, Aflag into an IEEE double in r1 (high long), r0 (low
		'' long).
		''
		'' INF / NAN / ZERO flags are honoured first.  Otherwise the
		'' significand is normalized here, the exponent is biased and
		'' range checked (overflow gives Infinity, exponent <= 0 gives
		'' a denormal), and the value is rounded to nearest with ties
		'' to even, FLAG_STICKY meaning "nonzero bits were discarded".
		''
		'' Changes A, Alo, Aexp, Aflag, tmp0, pa and the flags.
		''
DPack
		test	Aflag, #(FLAG_INF|FLAG_NAN|FLAG_ZERO) wz
	if_nz	jmp	#dpack_excep

		call	#Normalize
		'' Normalize sets FLAG_ZERO when the significand is all zero
		'' even though the caller did not flag the value as zero.
		'' Such a value must be packed as a signed zero; without this
		'' check its (meaningless) exponent would be biased and packed,
		'' producing a nonzero result.  FPack gets the same effect by
		'' normalizing before it tests the flags.
		test	Aflag, #FLAG_ZERO wz
	if_nz	jmp	#dpack_excep

		'' fix up exponent
		add	Aexp, ##DBIAS_EXP
		fles	Aexp, ##DMAX_EXP-1 wc	' C set if the exponent was too large
	if_c	mov	A, #0
	if_c	or	Aflag, #(FLAG_INF)	' overflow -> Infinity
	if_c	jmp	#dpack_excep
		cmps	Aexp, #0 wcz
	if_be	call	#dpack_denorm		' exponent <= 0: build a denormal

		'' round here
		'' we clear the implied one first, and allow the
		'' rounding to propagate up to it
		andn	A, ##one_4_28
		test	Aflag, #FLAG_STICKY wz	' NZ if bits were lost earlier

		'' we have 4.60, we want to round to 4.52
		'' half of the lsb is therefore 0x80
		'' we also want to round to nearest even, so
		'' add a sticky bit if the lsb is set
		test    Alo, #$100 wc		' C = lsb of the final fraction
    if_nz_or_c	or	Alo, #1
		add	Alo, #$7f wc		' with the extra bit this carries only above half
		addx  	A, #0

dpack_exp
		'' now shift down to 12.52
		shr     Alo,#8
		mov     tmp0,A
		shr     A,#8
		shl     tmp0,#24
		or      Alo,tmp0

		shl	Aexp, #20		' exponent field sits above the 20 high fraction bits

		mov	r0, Alo
		mov	r1, A
		add	r1, Aexp		' add, so a rounding carry bumps the exponent

		shl	Aflag, #31		' FLAG_SIGN (bit 0) becomes the IEEE sign bit
		or	r1, Aflag
DPack_ret
		ret

		''
		'' exponent is <=0, so we have to create an IEEE denormalized
		'' number: shift the significand right by |Aexp| + 1 places,
		'' recording lost bits in FLAG_STICKY.  Aexp is 0 on return.
		''
dpack_denorm

		abs	Aexp, Aexp
		add	Aexp, #1
_ddlp
		shr	A, #1 wc
		rcr	Alo, #1 wc
	if_c	or	Aflag, #FLAG_STICKY
		djnz	Aexp, #_ddlp

		ret

		''
		'' Infinity, NaN or zero: build the fraction and exponent
		'' field directly and let dpack_exp assemble the result
		''
dpack_excep
		mov	A, #0
		mov	Alo, #0
		mov	Aexp, ##DMAX_EXP	' all-ones exponent for Infinity / NaN
		test	Aflag, #FLAG_NAN wz
	if_nz	mov	A, ##one_4_28
	if_nz	shr	A, #1			' NaN: set the top fraction bit
	if_nz	jmp	#dpack_exp
		test	Aflag, #FLAG_ZERO wz
	if_nz	mov	Aexp, #0		' zero: exponent field 0, fraction 0
		jmp	#dpack_exp

		''
		'' DUnpack2: unpack two doubles.
		''   (r3,r2) -> B, Blo, Bflag, Bexp
		''   (r1,r0) -> A, Alo, Aflag, Aexp
		'' The second operand goes through DUnpack first and is then
		'' copied to the B registers.
		''
DUnpack2
		mov	Alo, r2
		mov	A, r3
		call	#DUnpack
		mov	Bexp, Aexp
		mov	Bflag, Aflag
		mov	Blo, Alo
		mov	B, A
		mov	Alo, r0
		mov	A, r1
		jmp	#DUnpack		' tail call: its ret returns to our caller


		''
		'' FPack: convert the unpacked value A, Alo (4.60 significand),
		'' Aexp, Aflag into an IEEE single precision float in r0.
		''
		'' The significand is normalized here first; Normalize may set
		'' FLAG_ZERO, which is why the flags are tested after the call.
		'' Rounding is to nearest, ties to even; any nonzero bit in Alo
		'' or a set FLAG_STICKY counts as "bits below the rounding point".
		''
		'' Changes A, Aexp, Aflag, pa and the flags.
		''
FPack
		call	#Normalize
		test	Aflag, #(FLAG_INF|FLAG_NAN|FLAG_ZERO) wz
	if_nz	jmp	#fpack_excep

		'' fix up exponent
		add	Aexp, #FBIAS_EXP
		fles	Aexp, #FMAX_EXP-1 wc	' C set if the exponent was too large
	if_c	or	Aflag, #FLAG_INF	' overflow -> Infinity
	if_c	jmp	#fpack_excep
		cmps	Aexp, #0 wcz
	if_be	call	#fpack_denorm		' exponent <= 0: build a denormal

		'' round here
		'' we clear the implied one first, and allow the
		'' rounding to propagate up to it
		andn	A, ##one_4_28
		cmp	Alo,#0 wz
	if_nz	or	Aflag, #FLAG_STICKY	' low long only matters as "something was there"
		test	Aflag, #FLAG_STICKY wz
		'' we have 4.28, we want to round to 4.23
		'' half of the lsb is therefore 0x10
		'' we also round to nearest even, so add a sticky
		'' bit if lsb is set
		test	A, #$20 wc		' C = lsb of the final fraction
    if_nz_or_c	or	A, #1

		add	A, #$f			' with the extra bit this carries only above half

fpack_exp
		'' now shift down to 9.23
		shr     A,#5 wz
		shl	Aexp, #23

		mov	r0, A
		add	r0, Aexp		' add, so a rounding carry bumps the exponent

		shl	Aflag, #31		' FLAG_SIGN (bit 0) becomes the IEEE sign bit
	_ret_	or	r0, Aflag

		''
		'' exponent is <=0, so we have to create an IEEE denormalized
		'' number: shift A right by |Aexp| + 1 places, stopping early
		'' once A is zero.  Aexp is 0 on return.
		''
fpack_denorm
		abs	Aexp, Aexp
		add	Aexp, #1	' shift one extra space
_fdlp
		shr	A, #1 wcz
	if_c	or	Aflag, #FLAG_STICKY	' remember lost bits
	if_z	mov	Aexp, #0		' nothing left to shift
	if_nz	djnz	Aexp, #_fdlp

fpack_denorm_ret
		ret

		''
		'' Infinity, NaN or zero
		''
fpack_excep
		mov	A, #0
		mov	Aexp, #FMAX_EXP		' all-ones exponent, zero fraction = Infinity

		test	Aflag, #FLAG_NAN wz
	if_nz	mov	r0, ##NAN_32		' NaN is returned as a fixed pattern, sign not applied
	if_nz	ret
		test	Aflag, #FLAG_ZERO wz
	if_z	jmp	#fpack_exp		' Infinity: assemble exponent and sign as usual
		' A is zero here
		mov	r0, Aflag
	_ret_	shl	r0, #31		' relies on FLAG_SIGN being in lowest bit!


		''
		'' FUnpack2: unpack two floats.
		''   r1 -> B, Blo, Bflag, Bexp
		''   r0 -> A, Alo, Aflag, Aexp
		'' The second operand goes through FUnpack first and is then
		'' copied to the B registers.
		''
FUnpack2
		mov	A, r1
		call	#FUnpack
		mov	Bexp, Aexp
		mov	Bflag, Aflag
		mov	B, A
		mov	Blo, Alo
		mov	A, r0
		jmp	#FUnpack		' tail call: its ret returns to our caller


'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' Actual commands start here
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

		'' addition and subtraction
		'' double precision: (r1,r0) = (r1,r0) -/+ (r3,r2)
		'' DSub flips the sign bit of the second operand and then
		'' shares the code of DAdd.
DSub
		xor	r3, ##$80000000
		'' fall through

DAdd
		call	#DUnpack2
		call	#_Add
		jmp	#DPack			' DPack's ret returns to our caller
		

		'' double precision multiply: (r1,r0) = (r1,r0) * (r3,r2)
DMul
		call	#DUnpack2
		call	#_Mul
		jmp	#DPack			' DPack's ret returns to our caller

		'' double precision divide: (r1,r0) = (r1,r0) / (r3,r2)
DDiv
		call	#DUnpack2
		call	#_Div
		jmp	#DPack			' DPack's ret returns to our caller

		'' double precision square root: (r1,r0) = sqrt((r1,r0))
DSqrt
		mov	A, r1
		mov	Alo, r0
		call	#DUnpack
		call	#_Sqrt
		jmp	#DPack			' DPack's ret returns to our caller
		
	'' single precision operations
	'' r0 = r0 -/+ r1
	'' FSub flips the sign bit of the second operand and then
	'' shares the code of FAdd.
FSub
		xor	r1, ##$80000000
		'' fall through

FAdd
		call	#FUnpack2
		call	#_Add
		jmp	#FPack			' FPack's return goes back to our caller

		'' single precision multiply: r0 = r0 * r1
FMul
		call	#FUnpack2
		call	#_Mul
		jmp	#FPack			' FPack's return goes back to our caller

		'' single precision divide: r0 = r0 / r1
		'' uses _DivSmall, which works on the high longs A and B only
FDiv
		call	#FUnpack2
		call	#_DivSmall
		jmp	#FPack			' FPack's return goes back to our caller

		'' single precision square root: r0 = sqrt(r0)
FSqrtx
		mov	A, r0
		call	#FUnpack
		call	#_Sqrt
		jmp	#FPack			' FPack's return goes back to our caller
		
	'' conversion operations
	'' single to double
	'' unpack the float in r0, repack as a double in (r1,r0)
FToD
		mov	A, r0
		call	#FUnpack
		jmp	#DPack			' DPack's ret returns to our caller

	'' double to single
	'' unpack the double in (r1,r0), repack (with rounding) as a float in r0
DToF
		mov	A, r1
		mov	Alo, r0
		call	#DUnpack
		jmp	#FPack			' FPack's return goes back to our caller

	'' 32 bit signed integer to float
	'' in:  r0 = integer     out: r0 = float
	'' r2 is used to tell doint which packer to finish with
IToF
		mov	r2, #0			' 0 selects single precision
		mov	Aflag, #0
		abs	A, r0 wcz		' C = r0 was negative, Z = r0 was zero
	if_c	or	Aflag, #FLAG_SIGN

	'' common tail of all integer conversions
	'' in:  A = magnitude, Aflag = sign, Z set if the input was 0,
	''      r2 = 0 for a float result, nonzero for a double result
doint
	if_z	mov	r1, #0
	if_z	ret				' 0 -> 0 (r0 is still 0)
		mov	Aexp, #28		' the integer sits in A, treated as 4.28 scaled by 2^28
		mov	Alo, #0
		cmp	r2, #0 wz
	if_z	jmp	#FPack			' packers normalize and round

dblprec
		jmp	#DPack

	'' 32 bit unsigned integer to float
	'' in:  r0 = integer     out: r0 = float
UIToF
		mov	r2, #0			' 0 selects single precision in doint
		mov	Aflag, #0		' never negative
		mov	A, r0  wcz		' Z = input was zero
		jmp	#doint

	'' 32 bit signed integer to double
	'' in:  r0 = integer     out: (r1,r0) = double
IToD
		mov	r2, #1			' nonzero selects double precision in doint
		mov	Aflag, #0
		abs	A, r0  wcz		' C = r0 was negative, Z = r0 was zero
	if_c	or	Aflag, #FLAG_SIGN
		jmp	#doint
	'' 32 bit unsigned integer to double
	'' in:  r0 = integer     out: (r1,r0) = double
UIToD
		mov	r2, #1			' nonzero selects double precision in doint
		mov	Aflag, #0		' never negative
		mov	A, r0 wz		' Z = input was zero
		jmp	#doint


'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' Utility functions go here
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

		''
		'' _Add: add the unpacked values A and B, result in A.
		''
		''   in:  A, Alo, Aexp, Aflag and B, Blo, Bexp, Bflag
		''   out: A, Alo, Aexp, Aflag (not normalized, not rounded)
		''
		'' The operand of larger magnitude is moved into A, B is
		'' shifted right to line up with it (bits shifted out set
		'' FLAG_STICKY), and the significands are added or, when the
		'' signs differ, subtracted.
		'' Changes B, Blo, Bexp, Bflag, tmp0, tmp1, tmp2 and the flags.
		''
_Add
		'' swap so magnitude of A is bigger than that of B
		'' NOTE: we are assuming here that infinity is given
		'' a big Aexp, and 0 a very tiny one
		cmps	Aexp, Bexp wcz
	if_a	jmp	#_addnoswap
	if_b	jmp	#_addswap
		'' equal exponents: compare the 64 bit significands
		cmp	Alo,Blo wcz
		cmpx	A,B	wcz
	if_ae	jmp	#_addnoswap
_addswap
		mov	tmp0,Aflag
		mov	tmp1,Aexp
		mov	Aflag,Bflag
		mov	Aexp, Bexp
		mov	Bflag,tmp0
		mov	Bexp, tmp1
		mov	tmp0, A
		mov	tmp1, Alo
		mov	A, B
		mov	Alo, Blo
		mov	B, tmp0
		mov	Blo, tmp1

_addnoswap
		'' shift B down as necessary
		'' if we are shifting more than 63 then
		'' we just return the original value

		mov	tmp0,Aexp
		sub	tmp0,Bexp wz		' tmp0 = number of places to shift B
	if_z	jmp	#_doadd
		fle	tmp0,#63 wc		' C set if the distance was more than 63
		'' FIXME: should raise inexact here?
	if_c	ret

		'' check for short shift
		cmp	tmp0,#32 wcz
	if_b	jmp	#_addshift

		'' shifting B by 32 or more: move a whole long first
		cmp	Blo,#0   wz
	if_nz	or	Aflag, #FLAG_STICKY	' the old low long is lost
		mov	Blo, B
		mov	B, #0
		sub	tmp0,#32 wz
		'' A distance of exactly 32 is complete now.  It must not
		'' fall into _addshift: there tmp1 would be 32, shifts only
		'' use the low 5 bits of their count, so "shl tmp2,tmp1"
		'' would shift by 0, leave tmp2 nonzero and set FLAG_STICKY
		'' although nothing was lost.
	if_z	jmp	#_doadd
_addshift
		'' now shift B, Blo by tmp0 which is in 1..31
		'' (B,Blo) >> tmp0 = (B>>tmp0),  (B<<tmp1)|(Blo>>tmp0)
		mov	 tmp1,#32
		sub	 tmp1,tmp0
		mov	 tmp2,Blo
		shl	 tmp2,tmp1 wz		' the bits of Blo that will fall off
	if_nz	or	 Aflag, #FLAG_STICKY
		shr	 Blo, tmp0
		mov	 tmp2, B
		shr	 B, tmp0
		shl	 tmp2, tmp1		' the bits of B that move into Blo
		or	 Blo, tmp2
_doadd
		'' now perform the addition
		mov	tmp0, Aflag
		xor     tmp0, Bflag
		test    tmp0, #FLAG_SIGN wz	' NZ if the signs differ
	if_nz	jmp     #_dosub
		add     Alo, Blo wc
		addx    A, B
		ret
_dosub
		'' check for INF - INF
		'' note that if B is INF, then A is NAN or INF, so
		'' in either case NAN is appropriate to return
		test	Bflag, #FLAG_INF wz
	if_nz	or	Aflag, #FLAG_NAN
		'' if bits of B were shifted out, B is really a little
		'' larger than what is left: borrow one to account for it
		test	Aflag, #FLAG_STICKY wc
		subx	Alo, Blo wcz
		subx	A, B
		mov	Blo, Alo
		or	Blo, A wz
		'' an exact cancellation gives +0
	if_z	andn	Aflag, #FLAG_SIGN
	if_z	or	Aflag, #FLAG_ZERO
		ret


		'' MulAcc128:
		'' multiply B,Blo * C,Clo
		'' and accumulate result into A,Alo,Alo2,Alo3
		''
		'' The 64x64 product is built from four 32x32 products.  Two
		'' of them use the CORDIC multiplier (qmul / getqx / getqy);
		'' the other two go through imp_mulhu, which is defined
		'' outside this section and, per the notes below, takes rs1
		'' and rs2 and returns the low word in rs1 and the high word
		'' in rd.  The sum is finally shifted left by 4.
		'' Changes tmp0, tmp1, rs1, rs2, rd and the flags.
_MulAcc128
		qmul	Blo, Clo		' start low * low; collected below
		mov	rs1, B
		mov	rs2, C
		call	#imp_mulhu	' rs1 = low word, rd = high word
		'' high * high goes into the top two longs
		add	Alo, rs1 wc
		addx	A, rd

		getqx	tmp0
		getqy	tmp1

		'' low * low goes into the bottom two longs
		add	Alo3, tmp0 wc
		addx	Alo2, tmp1 wc
		addx	Alo, #0 wc
		addx	A, #0 wc
		
		'' both low longs zero: no cross products, short finish
		mov	tmp0, Blo
		or	tmp0, Clo wz
	if_z	jmp	#_mulacc128_shift_short

		'' now do the cross products
		'' B * Clo on the CORDIC (skipped when Clo is 0) ...
		cmp    Clo, #0 wz
	if_nz	qmul   B, Clo

		'' ... and Blo * C through imp_mulhu (skipped when Blo is 0,
		'' in which case rs1 and rd are both 0)
		mov    rs1, Blo  wz
	if_z	mov    rd, #0
		mov    rs2, C
	if_nz	call   #imp_mulhu	' rs1 = low word, rd = high word

		add    Alo2, rs1 wc
		addx   Alo, rd wc
		addx   A, #0

		cmp    Clo, #0 wz
	if_nz	getqx  tmp0
	if_nz	getqy  tmp1
		
	if_nz	add    Alo2, tmp0 wc
	if_nz	addx   Alo, tmp1 wc
	if_nz   addx   A, #0
	
_mulacc128_shift
		'' now we have the result as an 8.120 bit number in A, Alo, Alo2, Alo3
		'' shift it up by 4 to get 4.124
		'' (each long takes the top nibble of the long below it)
		shl	 A, #4
		getnib	 tmp0, Alo, #7
		or	 A, tmp0
		
		shl	 Alo, #4
		getnib	 tmp0, Alo2, #7
		or	 Alo, tmp0

		shl	 Alo2, #4
		getnib	 tmp0, Alo3, #7
		or	 Alo2, tmp0

	_ret_	shl	 Alo3, #4

_mulacc128_shift_short
		'' now we have the result as an 8.120 bit number in A, Alo, Alo2, Alo3
		'' shift it up by 4 to get 4.124
		'' only A and Alo are shifted on this path; Alo2 and Alo3
		'' are left as they are
		shl	 A, #4
		getnib	 tmp0, Alo, #7
		or	 A, tmp0		
  _ret_		shl	 Alo, #4

		'' the actual multiply routine
		''
		''   in:  A, Alo, Aexp, Aflag and B, Blo, Bexp, Bflag
		''   out: A, Alo, Aexp, Aflag = A * B (not normalized or rounded)
		''
		'' Low order product bits that do not fit in A, Alo set
		'' FLAG_STICKY.  Changes C, Clo, Alo2, Alo3, Bflag, tmp0 and
		'' whatever _MulAcc128 changes.
_Mul
		mov	tmp0,Aflag
		or	tmp0,Bflag
		test	tmp0,#(FLAG_INF|FLAG_NAN) wz
	if_nz	jmp	#_mul_excep
		'' regular multiply
		add	Aexp,Bexp
		
		'' calculate (A,Alo) * (B, Blo)
		'' both are 4.60 numbers
		mov	C, A wz
		'' high long of A is zero: the product is zero, leave A and
		'' its flags alone and just fix the sign
	if_z	jmp	#_mul_sign
		mov	Clo, Alo
		mov	A, #0
		mov	Alo, #0
		mov	Alo2, #0
		mov	Alo3, #0
		cmp	B, #0 wz
		'' high long of B is zero: the product is zero.  A and Alo
		'' have just been cleared, so the result must also be flagged
		'' as zero; otherwise a packer that checks the flags before
		'' normalizing would pack the meaningless exponent sum.
	if_z	or	Aflag, #FLAG_ZERO
	if_z	jmp	#_mul_sign
		call	#_MulAcc128

		'' anything left in the low 64 bits of the product is
		'' only remembered as "inexact"
		or	 Alo2, Alo3 wz
	if_nz	or	 Aflag, #FLAG_STICKY
	
_mul_sign
		and	Bflag, #FLAG_SIGN	' adjust sign of result
	_ret_	xor	Aflag, Bflag

		'' special cases for zero, inf, NaN
_mul_excep
		'' if we get here, we know that either the
		'' NAN or INF bit is set
		'' if 0 is set as well, we have an illegal condition
		'' NAN*anything = NAN
		'' 0*inf == NAN
		test	tmp0, #(FLAG_NAN|FLAG_ZERO) wz
	if_nz	or	Aflag,#FLAG_NAN
	if_z	or	Aflag,#FLAG_INF
		jmp	#_mul_sign

		''
		'' the actual division routine
		''
		''
		'' (A, Alo) / (B, Blo)
		''
		'' DivSmall is for when Alo and Blo are both 0
		''
		''   in:  A, Aexp, Aflag and B, Bexp, Bflag
		''   out: A, Aexp, Aflag = A / B; a nonzero remainder sets
		''        FLAG_STICKY
		'' Infinity, NaN and division by zero are handled by the
		'' exception code shared with _Div.
		'' Changes tmp0 and the flags.
		''
_DivSmall
		'' start div, assuming we will need it
		'' (the sign and exception tests below run while the
		'' CORDIC divide is in progress)
		shr	A, #4 	       ' bottom bits of A are 0
		qfrac	A, B

		'' set sign of result
		mov    tmp0, Aflag
		xor    tmp0, Bflag
		test   tmp0, #FLAG_SIGN wz
		muxnz  Aflag, #FLAG_SIGN	' negative if exactly one operand is negative
		
		mov	tmp0, Aflag
		or	tmp0, Bflag
		'' check for divide by infinity or NAN
		test	tmp0, #(FLAG_INF|FLAG_NAN) wz
	if_nz	jmp	#_div_excep
		'' check for divide by 0
		test	Bflag, #FLAG_ZERO wz
	if_nz	jmp	#_div_by_zero

		sub	Aexp, Bexp
		getqx	A       ' quotient
		getqy	tmp0	' remainder
		cmp	tmp0, #0 wz
	if_nz	or	Aflag, #FLAG_STICKY
		ret

		
		''
		'' perform 4.60 / 4.60 division
		''
		''   in:  A, Alo, Aexp, Aflag and B, Blo, Bexp, Bflag
		''   out: A, Alo, Aexp, Aflag = A / B; a nonzero remainder
		''        sets FLAG_STICKY
		'' The quotient is produced one bit per pass by compare,
		'' subtract and shift, 61 passes when entered at _Div.
		'' Changes tmp0, tmp1, count and the flags.
		''
_Div
		mov	count,#61
_doDiv
		'' set the sign of the result
		mov	tmp0, Aflag
		xor	tmp0, Bflag
		test	tmp0, #FLAG_SIGN wz
		muxnz	Aflag,#FLAG_SIGN	' negative if exactly one operand is negative

		mov	tmp0, Aflag
		or	tmp0, Bflag
		'' check for divide by infinity or NAN
		test	tmp0, #(FLAG_INF|FLAG_NAN) wz
	if_nz	jmp	#_div_excep
		'' check for divide by 0
		test	Bflag, #FLAG_ZERO wz
	if_nz	jmp	#_div_by_zero

		'' regular divide loop here
		sub	Aexp, Bexp
		'' tmp1,tmp0 is the running remainder, starting as the dividend
		mov	tmp0, Alo
		mov	tmp1, A
		'' initialize quotient
		mov	A, #0
		mov	Alo, #0
_divloop
		'' does the divisor fit into the remainder?
		cmp	tmp0, Blo wcz
		cmpx	tmp1, B wcz
	if_b	jmp	#_div_skip_sub
		'' yes: subtract it and shift a 1 into the quotient
		sub	tmp0, Blo wcz
		subx	tmp1, B
		shl	Alo, #1 wc
		or	Alo, #1
		jmp	#_div_next
_div_skip_sub
		'' no: shift a 0 into the quotient
		shl	Alo, #1 wc
_div_next
		rcl	A, #1			' carry from the low quotient long
		'' double the remainder for the next bit
		shl	tmp0, #1 wc
		rcl	tmp1, #1
		djnz	count, #_divloop

		'' set sticky bit if necessary
		or     	 tmp0,tmp1 wz
	if_nz	or	 Aflag, #FLAG_STICKY

_Div_ret
_DivSmall_ret
		ret

		'' division by zero; only reached when neither operand is
		'' Infinity or NaN, so the test below is really "is A zero?"
		''   0 / 0 -> NaN,  x / 0 -> Infinity
_div_by_zero
		test	Aflag, #(FLAG_NAN|FLAG_INF|FLAG_ZERO) wz
	if_nz	or	Aflag, #FLAG_NAN
	if_z	or	Aflag, #FLAG_INF
		jmp	#_Div_ret

		''
		'' if some number is infinity or NaN, come here
		'' (tmp0 holds Aflag | Bflag)
		''
_div_excep
		test	tmp0, #FLAG_NAN wz
_div_nan
		'' also entered from below with NZ to force a NaN result
	if_nz	or	Aflag, #FLAG_NAN
	if_nz	jmp	#_Div_ret

		test	Aflag, #FLAG_INF wz
	if_z	jmp	#_a_finite
		'' infinity/x
		'' infinity/infinity is NaN, otherwise A stays Infinity
		test	Bflag, #(FLAG_INF) wz
	if_nz	jmp	#_div_nan
		jmp	#_Div_ret

		'' x/infinity
		'' the result is zero (with the sign computed above)
_a_finite
		or	Aflag, #FLAG_ZERO
		mov	A, #0
		mov	Alo, #0
		jmp	#_Div_ret

		''
		'' square root calculation
		''
		''   in:  A, Alo, Aexp, Aflag (as left by DUnpack / FUnpack)
		''   out: A, Alo, Aexp, Aflag = sqrt(A)
		''
		'' NaN and zero (of either sign) are returned unchanged,
		'' other negative values give NaN, +Infinity gives +Infinity.
		'' The hardware qsqrt supplies a 32 bit root.  For values
		'' unpacked by FUnpack with an exponent above -FBIAS_EXP that
		'' is used directly; everything else continues at sqrt_heron.
		'' Changes C, Clo, Cexp, Cflag, Alo2 and the flags.
		''
_Sqrt
		test	Aflag, #(FLAG_NAN|FLAG_ZERO) wz
	if_nz	ret
		'' sqrt(-x) -> NaN
		test	Aflag, #(FLAG_SIGN) wz
	if_nz	jmp	#sqrt_excep
		test	Aflag, #FLAG_INF wz	' sqrt(inf) == inf
	if_nz	ret
	
		mov	Alo2, #0
		' originally our mantissa is set up with 1 <= A < 2
		' and in 4.60 format
		' convert to 2.62 format
		' note that 1 * 2^60 = 4 * 2^58
		shl	Alo, #1 wc
		rcl	A, #1
		shl	Alo, #1 wc
		rcl	A, #1
		' NOTE(review): the exponent is lowered by 2 here; this looks
		' like it pairs with the result being shifted right by only 2
		' (not 3) when it is converted back below -- confirm
		sub	Aexp, #2
		
		' the exponent is halved below, so an odd exponent is
		' rounded up and the significand halved to compensate
		test	Aexp, #1 wz	' make Aexp even if necessary
	if_nz	add	Aexp, #1
	if_nz	shr	A, #1 wc
	if_nz	rcr	Alo, #1
	
		'' get square root
#ifdef DEBUG
		mov	uart_char, #">"
		call	#ser_tx
		mov	uart_num, A
		call	#ser_hex
		mov	uart_num, Alo
		call	#ser_hex
		mov	uart_char, #"^"
		mov	uart_num, Aexp
		call	#ser_hex
		call	#ser_nl
#endif		
		'' start the CORDIC square root of the 64 bit value A:Alo;
		'' the result is collected with getqx below
		qsqrt	Alo, A

		'' save original value of A
		mov	Cexp, Aexp
		mov	C, A
		mov	Clo, Alo
		mov	Cflag, Aflag

		' do square root
		sar	Aexp, #1		' exponent of the root is half the exponent
		getqx	A	' A has square root in 1.31 format
		
		'' doubles, and values with a very small exponent, need
		'' more bits than qsqrt gives
		test	Cflag, #FLAG_DOUBLE wz
	if_nz	jmp	#sqrt_heron
		cmps	Cexp, ##-FBIAS_EXP wcz
	if_be	jmp	#sqrt_heron

		'' convert to 4.28
		mov	Alo, A
		shl	Alo, #30		' the two bits shifted out of A go to the top of Alo
		shr	A, #2
		
		'' FIXME: should check here for exact squares,
		'' for which FLAG_STICKY is not appropriate
	_ret_	or	Aflag, #FLAG_STICKY
		
		'' sqrt_heron: second stage of _Sqrt.
		''   in:  A = first estimate x0 from qsqrt,
		''        C, Clo = the value whose root is wanted (as saved
		''        by _Sqrt), Aexp / Aflag already set for the result
		''   out: A, Alo = refined root
		'' Changes B, Blo, tmp0, tmp1 and the flags; with EXACT defined
		'' also rs1, rs2, rd (through imp_mulhu, which is defined
		'' outside this section).
sqrt_heron
		'' have to do an iteration of Heron's algorithm to get
		'' the low bits
		'' we have C = origA (2.30)
		''         A = x0 (1.31)
		'' calculate x1 = 1/2 * (x0 + origA / x0)
		''
		mov	Alo, #0

		'' x0 is 1.31
		''
		'' compute origA / x0
		''
		'' we know 0.5 <= x0 < 2
		'' and     0.5 <= origA < 4
		''         
		'' Q = 2^32
		'' S = 2^60
		'' T = 2^62
		''
		'' origA = (C*Q + Clo) / T
		'' x0 = B / R, where R = 2^31
		''
		'' so T * origA / x0 =  T * ((C*Q + Clo) / T) * (R / A)
		''                   =  T * ((C*Q + Clo) / A) * (R / T)
		''                   =  ((C*Q + Clo) / A) * R
		''
		'' NOTE(review): the derivation above names x0 both "B / R"
		'' and, in the following lines, "A"; in the code x0 is in A.

		'' we're calculating 2^32 * C / A
		'' what we really want is 2^31 * C / A
		'' so our value is twice as big as it should be
		'' 64 bit by 32 bit divide: (C:Clo) / A
		setq	C
		qdiv	Clo, A
		getqx	B		' B is the quotient
		getqy	tmp1		' tmp1 is the remainder

		'' divide the remainder again to get 32 more quotient bits
		qfrac	tmp1, A
		getqx	Blo
		getqy	tmp1

		'' so here B,Blo = 2*(origA / x0) as a 2.30 number
		'' or, equivalently, origA / x0 as a 1.31 number

		
		'' now make A = (x0 + (origA / x0)) as a 1.31 number
		add	Alo, Blo wc
		addx	A, B wc

		'' finally divide by 2
		'' (the carry out of the addition is rotated back in at the top)
		rcr	A, #1 wc
		rcr	Alo, #1

		'' we have a 1.63 number here, but want 4.60
		shr	A, #1 wc
		rcr	Alo, #1
		shr	A, #1 wc
		rcr	Alo, #1
		
#ifdef DEBUG
		mov	uart_char, #"i"
		call	#ser_tx
		mov	uart_num, A
		call	#ser_hex
		mov	uart_num, Alo
		call	#ser_hex
		call	#ser_nl
#endif
#ifdef EXACT
		'' OK, let's calculate the error A*A - origA
		'' origA is in C
		'' so set B = A*A
		'' the square of A:Alo is A*A, Alo*Alo and twice A*Alo
		qmul	A, Alo			' cross product, collected below
		mov	rs1, A
		mov	rs2, A
		call	#imp_mulhu ' result in rd, rs1
		mov	B, rd
		mov	Blo, rs1
		mov	rs1, Alo
		mov	rs2, Alo
		call	#imp_mulhu  ' result in rd, rs1
		'' so now partial result is:
		'' B, Blo, rd, rs1
		getqx	tmp1
		getqy	tmp0
		'' double (tmp0, tmp1)
		add	tmp1, tmp1 wc
		addx	tmp0, tmp0 wc
		addx	B, #0
		
		'' B, Blo + tmp0, rd + tmp1, rs1
		add	tmp1, rd wc
		addx	Blo, tmp0 wc
		addx	B, #0
		
		'' partials are in 8.120 format
		'' need to shift up to 4.124
		'' (each long takes the top nibble of the long below it)
		shl	 B, #4
		getnib	 tmp0, Blo, #7
		or	 B, tmp0
		
		shl	 Blo, #4
		getnib	 tmp0, tmp1, #7
		or	 Blo, tmp0
		
		shl	 tmp1, #4
		getnib	 tmp0, rs1, #7
		or	 tmp1, tmp0
		shl	 rs1, #4
#ifdef DEBUG
		mov	uart_char, #"*"
		call	#ser_tx
		mov	uart_num, B
		call	#ser_hex
		mov	uart_num, Blo
		call	#ser_hex
		mov	uart_num, tmp1
		call	#ser_hex
		mov	uart_num, rs1
		call	#ser_hex
		call	#ser_nl
		mov	uart_char, #"%"
		call	#ser_tx
		mov	uart_num, C
		call	#ser_hex
		mov	uart_num, Clo
		call	#ser_hex
		mov	uart_num, #0
		call	#ser_hex
		call	#ser_nl
#endif

		'' B, Blo, tmp1, rs1 has 128 bit A*A
		'' (rs1 is only used as "are the low 64 bits nonzero" from here on)
		or	rs1, tmp1 wz
	if_nz	or	Aflag, #FLAG_STICKY	' any bits chopped off?
	
		'' calculate e = A*A - origA
		'' origA has no bits in the low two longs, so 0 is
		'' subtracted there; the flags end up describing the
		'' whole 128 bit difference
		sub	rs1, #0 wcz
		subx	tmp1, #0 wcz
		subx	Blo, Clo wcz
		subx	B, C wcz

		'' here e = A*A - origA
		'' if e > 0, then we've overestimated and need to back off
	
		'' we want it to be an underestimate, so if it is not
		'' then subtract 1
		mov	tmp1, #0
	if_a	mov	tmp1, #1

		sub	Alo, tmp1 wc
		subx	A, #0
		ret
#else
	_ret_	or	Aflag, #FLAG_STICKY
#endif
sqrt_excep
		mov	A, #0
		mov	Alo, #0
	_ret_	or	Aflag, #FLAG_NAN
		
{{


+------------------------------------------------------------------------------------------------------------------------------+
|                                                   TERMS OF USE: MIT License                                                  |                                                            
+------------------------------------------------------------------------------------------------------------------------------+
|Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation    | 
|files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,    |
|modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software|
|is furnished to do so, subject to the following conditions:                                                                   |
|                                                                                                                              |
|The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.|
|                                                                                                                              |
|THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE          |
|WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR         |
|COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,   |
|ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                         |
+------------------------------------------------------------------------------------------------------------------------------+
}}