Skip to content

Reverse engineering CRF

Stan Bobovych edited this page Jan 22, 2016 · 120 revisions

References:
https://msdn.microsoft.com/en-us/library/windows/desktop/bb509656%28v=vs.85%29.aspx

Specific references: Mapping texels to pixels: https://msdn.microsoft.com/en-us/library/windows/desktop/bb219690%28v=vs.85%29.aspx
Destination register masks: https://msdn.microsoft.com/en-us/library/windows/desktop/bb172949%28v=vs.85%29.aspx
How input and output semantics work: https://msdn.microsoft.com/en-us/library/windows/desktop/bb944006%28v=vs.85%29.aspx
D3D types: https://msdn.microsoft.com/en-us/library/windows/desktop/bb172533%28v=vs.85%29.aspx
D3DVERTEXELEMENT9 structure: https://msdn.microsoft.com/en-us/library/windows/desktop/bb172630%28v=vs.85%29.aspx

Learn Vertex & Pixel Shader Programming With DirectX 9 - great examples of shader asm
Shaders for Game Programmers and Artists - discussion about bump map vs normal map
Real Time Rendering - more theoretical description of normal mapping
ShaderX 1 - example of using D3DCOLOR to store normals, tangents and bitangents, example of compressed vertex stream declaration
Real Time Shader Programming - transforming normal vectors
Data Structures and Algorithms for Game Developers - some details about DXT compression and normal maps
Real Time Rendering Tricks and Techniques in DirectX - extensive examples of pixel shaders and bump mapping

cX registers are constants, rX registers are temporary registers

Static meshes

Models that don't have skeletons have the following vertex buffer layout:

Stream Offset Type Method Usage UsageIndex Remark
0 0 FLOAT3 DEFAULT POSITION 0 Vertex position
0 12 D3DCOLOR DEFAULT COLOR 0 Normal
0 16 D3DCOLOR DEFAULT COLOR 1 Unknown (Tangents?)
0 20 SHORT2N DEFAULT TEXCOORD 0 Texture coordinates
0 24 SHORT2N DEFAULT TEXCOORD 1 Unknown
0 28 D3DCOLOR DEFAULT BLENDWEIGHT 1 Unknown
D3DDECL_END
/* Specify buffer layout with type/name pairs, e.g. "float3 position;"
   HLSL Base types: bool, byte, short, int, half, float, double
   HLSL Vector Types: float3, vector<uint,3>, float3x3, matrix<xshort,2,2>
   Modifiers for byte/short/int: s=signed, u=unsigned, x=unsigned hex */

/* Field order follows the static-mesh vertex declaration: offsets 0, 12, 16, 20, 24, 28
   (32 bytes in total). */
float3 position;    /* offset 0,  FLOAT3   POSITION 0 */
ubyte4 normal;      /* offset 12, D3DCOLOR COLOR 0; the shader expands it with value * 2 - 1 */
ubyte4 tangent;     /* offset 16, D3DCOLOR COLOR 1 */
/* NOTE(review): the declaration type for both UV sets is SHORT2N, which is a signed
   normalized format; ushort2 shows the raw bits as unsigned - confirm whether short2
   is the more useful view. */
ushort2 uv0;        /* offset 20, SHORT2N  TEXCOORD 0 */
ushort2 uv1;        /* offset 24, SHORT2N  TEXCOORD 1 */
ubyte4 blendweight; /* offset 28, D3DCOLOR BLENDWEIGHT 1 */

USP 45 example

Vertex shader float constant registers:

Register Remark
0 -0.895 -0.053 -0.443 -633.405 World Transform
1 0.003 0.992 -0.124 2.008 World Transform
2 0.446 -0.113 -0.888 -1301.420 World Transform
3 -0.117 0.000 2.340 2943.542 View Transform?
4 -3.797 1.701 -0.190 -2779.526 View Transform?
5 -0.409 -0.916 -0.020 226.678 View Transform?
6 -0.408 -0.913 -0.020 232.955 View Transform?
7 -450.501 485.219 -1280.439 0.000 Eye, camera or light direction?
8 -0.001 1.490 -0.009 0.909
9 0.173 0.212 0.365 0.000 Eye, camera or light direction?
10 -0.408 -0.913 -0.020 232.955
11 0.500 0.500 0.500 0.500
12 0.500 0.500 0.500 0.500
13 1.000 1.000 0.000 0.000 some kind of scaling factor for diffuse UVs?
14 2.000 -1.000 1.000 0.000 Set in vertex shader

The specular constant (defined in the object's material) is passed to the pixel shader through float constant register c8. For example, USP 45 has a specular constant of (0.170, 0.170, 0.170, 0); inspecting the device state at the call to DrawIndexedPrimitive shows that value in c8.

Pixel shader float constant registers:

Register Remark
0 -643582208.000 500000000.000 -579484160.000 0.000
1 0.325 0.346 0.424 0.000
2 -661.576 -19.550 -1353.910 0.000
3 0.400 0.400 0.350 1.000
4 -671.998 -40.875 -1263.387 0.000
5 0.320 0.320 0.400 1.000
6 0.294 0.345 0.686 0.000
7 0.090 0.090 0.090 0.000
8 0.170 0.170 0.170 0.000 specular constant?
9 0.000 0.000 0.000 0.000
10 0.090 0.090 0.090 0.000
11 3.000 15.000 0.100 0.200
12 0.120 0.300 0.900 0.000
13 0.000 0.000 0.000 0.000

Vertex shader:

    vs_3_0
    // Vertex shader for static (non-skinned) meshes.
    // It expands the D3DCOLOR-packed vectors to [-1, 1], transforms the position and
    // the three vectors by rows c0-c2, projects the position with rows c3-c6 and
    // forwards UVs, the c0-c2-space position and a normalized c7-relative vector.
    // c14 supplies literal helpers: x/y = 2/-1 for the [0,1] -> [-1,1] expansion, z = 1, w = 0
    def c14, 2, -1, 1, 0 // c14.xyzw = {2,-1,1,0}
    dcl_position v0     // vertex position in register v0
    dcl_blendweight1 v1 // blendweight1 in register v1
    dcl_texcoord v2     // UVs (texture coordinates)
    dcl_texcoord1 v3    // 2nd set of UVs?
    dcl_color v4        // normals
    dcl_color1 v5       // tangents?
    dcl_position o0     // output transformed vertex position
    dcl_texcoord o1         // output {xy = 1st UV set before c13 scaling, zw = 2nd UV set}
    dcl_texcoord1 o2.xyz    // output normalize(transformed position - c7)
    dcl_texcoord2 o3.xyz    // output position after the c0-c2 transform
    dcl_texcoord3 o4.xyz    // output transformed tangent
    dcl_texcoord4 o5.xyz    // output transformed blendweight1 vector (bitangent? - unconfirmed)
    dcl_texcoord5 o6.xyz    // output transformed normal
    dcl_texcoord6 o7        // set to the constant (1,1,0,0) below, not used in ps
    dcl_texcoord8 o8        // xyz = c9, w = blend factor computed below (both read by the ps as v6)
    dcl_texcoord9 o9.xy     // output 1st UV set scaled by c13

    // transform normals
    // mad = multiply add: dst = src0 * src1 + src2
    // Expand from compressed D3DCOLOR to the [-1, 1] range
    mad r0.xyz, v4.zyxw, c14.x, c14.y    // r0.x = normal.z * 2 - 1
                                         // r0.y = normal.y * 2 - 1
                                         // r0.z = normal.x * 2 - 1
                                         // r0.w is not updated due to mask!
    // dp3 = 3 element dot product
    // The transformed normal is written out without being renormalized, which suggests
    // that the 3x3 part of c0,c1,c2 is orthogonal (rotation only).
    // If a 3x3 matrix only contains rotations, its transpose = its inverse,
    // so another theory is that c0,c1,c2 is only the WorldTransform (transposed) matrix,
    // since just three rows are used.
    // Equivalent macro: m3x3 o6.xyz, r0, c0
    dp3 o6.x, r0, c0    // o6.x = r0.x*c0.x  + r0.y*c0.y + r0.z*c0.z
    dp3 o6.y, r0, c1    // o6.y = r0.x*c1.x  + r0.y*c1.y + r0.z*c1.z
    dp3 o6.z, r0, c2    // o6.z = r0.x*c2.x  + r0.y*c2.y + r0.z*c2.z
    mov r0.w, c14.z     // r0.w = 1, so r0 is a homogeneous point for the dp4s with c3-c6 and c10 below

    // transform vertex positions
    mad r1, v0.xyzx, c14.zzzw, c14.wwwz    // r1.x = position.x * 1 + 0
                                           // r1.y = position.y * 1 + 0
                                           // r1.z = position.z * 1 + 0
                                           // r1.w = position.x * 0 + 1
    // dp4 = 4 element dot product
    // Here the vertex is transformed by the matrices, but it is not certain how they are constructed.
    // Maybe c0,c1,c2 is the WorldTransform and c3,c4,c5,c6 is ViewTransform*Projection?
    // Equivalent macro: m4x3 r0.xyz, r1, c0 (three dp4s, so the .w column of c0-c2 is applied too)
    dp4 r0.x, r1, c0    // r0.x = r1.x*c0.x + r1.y*c0.y + r1.z*c0.z + r1.w*c0.w
    dp4 r0.z, r1, c2    // r0.z = dot(r1, c2)
    dp4 r0.y, r1, c1    // r0.y = dot(r1, c1)
    // Equivalent macro: m4x4 o0, r0, c3
    dp4 o0.x, r0, c3   // output position = r0 dot product c3
    dp4 o0.y, r0, c4
    dp4 o0.z, r0, c5
    dp4 o0.w, r0, c6
    // r0.w = saturate(dot(r0, c10) * c8.x + c8.y). In the captured constants c10 has the
    // same values as c6, so the dot product matches o0.w.
    // NOTE(review): presumably a distance-based fog factor - it ends up in o8.w; confirm.
    dp4 r0.w, r0, c10
    mad_sat r0.w, r0.w, c8.x, c8.y

    // transform tangents
    // Expand from compressed D3DCOLOR to the [-1, 1] range
    mad r1.xyz, v5.zyxw, c14.x, c14.y    // r1.x = v5.z * 2 - 1
                                         // r1.y = v5.y * 2 - 1
                                         // r1.z = v5.x * 2 - 1
    // Equivalent macro: m3x3 o4.xyz, r1, c0
    dp3 o4.x, r1, c0
    dp3 o4.y, r1, c1
    dp3 o4.z, r1, c2

    // transform blendweights
    // Expand from compressed D3DCOLOR to the [-1, 1] range
    mad r1.xyz, v1.zyxw, c14.x, c14.y    // r1.x = v1.z * 2 - 1
                                         // r1.y = v1.y * 2 - 1
                                         // r1.z = v1.x * 2 - 1
    // Equivalent macro: m3x3 o5.xyz, r1, c0
    dp3 o5.x, r1, c0
    dp3 o5.y, r1, c1
    dp3 o5.z, r1, c2
    // o8.w = saturate(f - saturate((r0.y * c8.z + c8.w) * (1 - f))), where f is the
    // factor left in r0.w above and r0.y is the transformed position's y component
    mad r1.x, r0.y, c8.z, c8.w      // r1.x = r0.y * c8.z + c8.w
    add r1.y, -r0.w, c14.z          // r1.y = 1 - r0.w
    mul_sat r1.x, r1.x, r1.y        // r1.x = saturate(r1.x * r1.y)
    add_sat o8.w, r0.w, -r1.x       // o8.w = saturate(r0.w - r1.x)

    // transform 2nd set of UV
    mad o1.zw, v3.xyxy, c12.xyxy, c12    // o1.z = v3.x * c12.x + c12.z (0.5 and 0.5 in the capture)
                                         // o1.w = v3.y * c12.y + c12.w (0.5 and 0.5 in the capture)

    // transform 1st set of UV
    // is adjustment by 0.5 for mapping texels to pixels?
    // NOTE(review): the UVs are declared as SHORT2N, so * 0.5 + 0.5 may simply remap
    // [-1, 1] to [0, 1] - confirm.
    mad r1.xy, v2, c11, c11.zwzw    // temp1.x = v2.x * c11.x + c11.z (0.5 and 0.5 in the capture)
                                    // temp1.y = v2.y * c11.y + c11.w (0.5 and 0.5 in the capture)
    mul o9.xy, r1, c13              // o9.x (texcoord9) = temp1.x * c13.x (1 in the capture)
                                    // o9.y (texcoord9) = temp1.y * c13.y (1 in the capture)
    mov o1.xy, r1                   // o1.xy (texcoord0) = temp1, i.e. the UVs before c13 scaling

    // calculate and normalize some vector
    // c7 is either light, camera or most likely eye position
    add r1.xyz, r0, -c7             // transformed vertex position - c7
    mov o3.xyz, r0                  // save transformed vertex position in o3
    dp3 r0.x, r1, r1                // r0.x = r1.x*r1.x + r1.y*r1.y + r1.z*r1.z (sum of squares)
    rsq r0.x, r0.x                  // r0.x = 1 / sqrt(r0.x) = 1/d
    mul o2.xyz, r1, r0.x            // o2.x = r1.x * 1/d
                                    // o2.y = r1.y * 1/d
                                    // o2.z = r1.z * 1/d (normalized each component)
                                    // o2 has a normalized light, camera or eye vector

    // remaining outputs
    mov o7, c14.zzww                // o7 = (1, 1, 0, 0)
    mov o8.xyz, c9                  // o8.xyz = c9.xyz; the ps blends its result towards this value by o8.w

// approximately 38 instruction slots used

Pixel shader:

    ps_3_0
    // Pixel shader for static meshes.
    // It evaluates three sources (c2/c3, c0/c1 and c4/c5) with the same pattern: a term
    // from dot(N, direction) and a power-32 term from dot(N, H), then multiplies the
    // summed direct terms by the s0 texture, adds the power-32 terms and blends the
    // result with v6.
    // NOTE(review): the pattern looks like per-pixel diffuse + specular lighting with a
    // position in cN.xyz, a falloff coefficient in cN.w and a color in c(N+1).xyz - confirm.
    def c10, 2, -1, 1, 0.5     // x/y = 2/-1 expand [0,1] to [-1,1]; z = 1; w = 0.5
    def c11, 0, 32, 0, 0       // x = 0 (lower clamp), y = 32 (exponent)
    dcl_texcoord v0.xy     // non-scaled diffuse UVs
    dcl_texcoord1 v1.xyz   // normalized (position - c7) vector from the vertex shader
    dcl_texcoord2 v2.xyz   // position written to o3 by the vertex shader
    dcl_texcoord3 v3.xyz   // transformed tangents
    dcl_texcoord4 v4.xyz   // transformed blendweights
    dcl_texcoord5 v5.xyz   // transformed normals
    dcl_texcoord8 v6       // xyz = vertex shader c9, w = blend factor from the vertex shader
    dcl_texcoord9 v7.xy    // scaled diffuse UVs
    // 2d samplers, only two textures are used
    dcl_2d s0
    dcl_2d s1

    // --- source c2/c3: falloff, color and half vector ---
    add r0.xyz, c2, -v2          // r0.xyz = c2 - position (vector towards c2)
    dp3 r0.w, r0, r0             // r0.w = r0.x*r0.x + r0.y*r0.y + r0.z*r0.z
    mov r1.z, c10.z              // r1.z = 1
    mad r1.x, r0.w, -c2.w, r1.z  // r1.x = r0.w * -c2.w + 1
    rsq r0.w, r0.w               // r0.w = 1/sqrt(r0.w)
    max r2.x, r1.x, c11.x        // r2.x = max(r1.x , 0)
    add_sat r1.x, r2.x, r2.x     // r1.x = saturate(2 * r2.x)
    mul r1.xyw, r1.x, c3.xyzz    // r1.xyw = r1.x * c3.xyz (third component lands in r1.w)
    mul r2.xyz, r1.xyww, c8      // r2.xyz = r1.xyw * c8.xyz (c8 = specular constant)
    nrm r3.xyz, v1               // r3 = normalize(v1)
    mad r4.xyz, r0, r0.w, -r3    // r4 = normalize(r0) - r3 (unnormalized half vector)
    mul r0.xyz, r0, r0.w         // r0.xyz = normalize(r0)
    nrm r5.xyz, r4               // r5 = normalized half vector

    // --- per-pixel normal ---
    // sample s1 at the scaled UVs in v7 and store the texel in r4. texld only uses the
    // coordinates (v7 is declared as .xy); an explicit LOD in .w is texldl, not texld.
    texld r4, v7, s1
    mad r4.xy, r4.wyzw, c10.x, c10.y    // r4.x = r4.w * 2 - 1
                                        // r4.y = r4.y * 2 - 1
    mul r6.xyz, r4.y, v4            // r6 = r4.y * v4
    mad r6.xyz, r4.x, v3, r6        // r6 = r4.x * v3 + r6
    dp2add r0.w, r4, -r4, c10.z     // r0.w = r4.x*-r4.x + r4.y*-r4.y + 1
    rsq r0.w, r0.w                  // r0.w = 1 / sqrt(r0.w)
    rcp r0.w, r0.w                  // r0.w = sqrt(1 - r4.x^2 - r4.y^2), the reconstructed third component
    mad r4.xyz, r0.w, v5, r6        // r4 = r0.w * v5 + r6 (per-pixel normal N, not renormalized)

    // --- source c2/c3: power-32 term ---
    dp3_sat r0.w, r4, r5            // r0.w = saturate(dot(N, H))
    pow_sat r2.w, r0.w, c11.y       // r2.w = (r0.w)^32
    mul r2.xyz, r2, r2.w
    mul r5.xyz, r2, c3.w            // r5 = accumulator of power-32 terms weighted by cN.w

    // --- source c0/c1 ---
    add r6.xyz, c0, -v2             // r6 = c0 - position
    dp3 r0.w, r6, r6
    rsq r2.w, r0.w
    mad r0.w, r0.w, -c0.w, r1.z     // r0.w = 1 - |r6|^2 * c0.w
    max r3.w, r0.w, c11.x           // r3.w = max(r0.w, 0)
    add_sat r0.w, r3.w, r3.w        // r0.w = saturate(2 * r3.w)
    mul r7.xyz, r0.w, c1            // r7 = r0.w * c1.xyz
    mad r8.xyz, r6, r2.w, -r3       // r8 = normalize(r6) - r3 (unnormalized half vector)
    mul r6.xyz, r6, r2.w            // r6 = normalize(r6)
    dp3_sat r0.w, r4, r6            // r0.w = saturate(dot(N, r6))
    mul r6.xyz, r7, r0.w            // r6 = direct term for c0/c1
    mul r7.xyz, r7, c8
    nrm r9.xyz, r8                  // r9.xyz = normalize(r8)
    dp3_sat r0.w, r4, r9
    pow_sat r2.w, r0.w, c11.y       // r2.w = (r0.w)^32
    mul r7.xyz, r7, r2.w            // r7 = power-32 term for c0/c1
    mad r5.xyz, r7, c1.w, r5        // r5 += r7 * c1.w

    // --- source c4/c5 ---
    add r8.xyz, c4, -v2             // r8 = c4 - position
    dp3 r0.w, r8, r8
    rsq r2.w, r0.w
    mad r0.w, r0.w, -c4.w, r1.z     // r0.w = 1 - |r8|^2 * c4.w
    max r3.w, r0.w, c11.x           // r3.w = max(r0.w, 0)
    add_sat r0.w, r3.w, r3.w        // r0.w = saturate(2 * r3.w)
    mul r9.xyz, r0.w, c5            // r9 = r0.w * c5.xyz
    mad r3.xyz, r8, r2.w, -r3       // r3 = normalize(r8) - r3 (unnormalized half vector)
    mul r8.xyz, r8, r2.w            // r8 = normalize(r8)
    dp3_sat r0.w, r4, r8            // r0.w = saturate(dot(N, r8))
    mul r8.xyz, r9, r0.w            // r8 = direct term for c4/c5
    mul r9.xyz, r9, c8
    nrm r10.xyz, r3                 // r10.xyz = normalize(r3)
    dp3_sat r0.w, r4, r10
    pow_sat r2.w, r0.w, c11.y       // r2.w = r0.w^32
    mul r3.xyz, r9, r2.w            // r3 = power-32 term for c4/c5
    mad r5.xyz, r3, c5.w, r5        // r5 += r3 * c5.w

    // --- combine the power-32 terms: r2 = 2 * sum(term * (1 - cN.w)) + 2 * sum(term * cN.w) ---
    add r0.w, r1.z, -c3.w           // r0.w = 1 - c3.w
    mul r2.xyz, r2, r0.w
    add r2.w, r1.z, -c1.w           // r2.w = 1 - c1.w
    mad r2.xyz, r7, r2.w, r2
    add r1.z, r1.z, -c5.w           // r1.z = 1 - c5.w
    mad r2.xyz, r3, r1.z, r2
    add r2.xyz, r2, r2              // r2 = 2 * r2
    mad r2.xyz, r5, c10.x, r2       // r2.x = r5.x * 2 + r2.x
                                    // r2.y = r5.y * 2 + r2.y
                                    // r2.z = r5.z * 2 + r2.z

    // --- combine the direct terms the same way (kept in r0.xzw and r1) ---
    dp3_sat r0.x, r4, r0            // r0.x = saturate(dot(N, normalize(c2 - position)))
    mad r0.y, r4.y, c10.w, c10.w    // r0.y = r4.y * 0.5 + 0.5
    mul r1.xyw, r1, r0.x            // r1.xyw = direct term for c2/c3
    mul r0.xzw, r0.w, r1.xyyw       // r0.xzw = r1.xyw * (1 - c3.w)
    mul r1.xyw, r1, c3.w
    mad r1.xyw, r6.xyzz, c1.w, r1
    mad r0.xzw, r6.xyyz, r2.w, r0
    mad r0.xzw, r8.xyyz, r1.z, r0
    mad r1.xyz, r8, c5.w, r1.xyww
    add r0.xzw, r0, r0              // r0.xzw = 2 * r0.xzw
    mad r0.xzw, r1.xyyz, c10.x, r0  // r0.x = r1.x * 2 + r0.x
                                    // r0.z = r1.y * 2 + r0.z
                                    // r0.w = r1.z * 2 + r0.w

    // --- add lerp(c7, c6, N.y * 0.5 + 0.5) and c9 * c8.w ---
    mov r1.xyz, c7
    add r1.xyz, -r1, c6             // r1 = c6 - c7
    mad r1.xyz, r0.y, r1, c7        // r1 = c7 + r0.y * (c6 - c7)
    add r0.xyz, r0.xzww, r1         // r0.xyz = direct sum (held in r0.xzw) + r1
    mov r0.w, c8.w
    mad r0.xyz, c9, r0.w, r0        // r0.xyz += c9 * c8.w

    // sample s0 with coordinate v0 and store in r1
    texld r1, v0, s0
    mad r0.xyz, r1, r0, r2          // r0 = texel * summed direct terms + summed power-32 terms
    mov oC0.w, r1.w                 // output alpha = s0 alpha
    add r0.xyz, r0, -v6
    // this sets the final output color: oC0.xyz = v6.xyz + v6.w * (r0 - v6.xyz),
    // i.e. a blend from v6.xyz (v6.w = 0) to the computed color (v6.w = 1)
    mad oC0.xyz, v6.w, r0, v6

// approximately 104 instruction slots used (2 texture, 102 arithmetic)

Skinned meshes

Stream Offset Type Method Usage UsageIndex
0 0 FLOAT3 DEFAULT POSITION
0 12 D3DCOLOR DEFAULT COLOR
0 16 D3DCOLOR DEFAULT COLOR
0 20 SHORT2N DEFAULT TEXCOORD
0 24 SHORT2N DEFAULT TEXCOORD
0 28 BLENDWEIGHT DEFAULT BLENDWEIGHT
2 0 BLENDWEIGHT DEFAULT BLENDWEIGHT
2 4 BLENDWEIGHT DEFAULT BLENDWEIGHT
D3DDECL_END
vs_3_0
    // Vertex shader for skinned meshes.
    // It blends four sets of three constant rows (indexed relative to c11, c12 and c13
    // through a0) by the weights in v1, applies the blended rows to the position,
    // normal, tangent and blendweight1 vector, and then continues like the static-mesh
    // shader (c0-c2, then c3-c6). It additionally modifies o8 using c236-c238.
    // c242: x/y = 2/-1 expand [0,1] to [-1,1]; z = 765.005859 scales the blend indices;
    //       w = -0, so -c242.y = 1 and -c242.w = 0 are used as literal 1 and 0 below.
    // NOTE(review): 765 = 255 * 3, which suggests normalized byte indices and three
    // constant registers per bone - confirm.
    def c242, 2, -1, 765.005859, -0
    // c243 = (-1/(2*pi), 0.5, 2*pi, -pi): used to wrap an angle into [-pi, pi) before sincos
    def c243, -0.159154937, 0.5, 6.28318548, -3.14159274
    dcl_position v0
    dcl_blendweight v1
    dcl_blendweight1 v2
    dcl_blendindices v3
    dcl_texcoord v4
    dcl_texcoord1 v5
    dcl_color v6
    dcl_color1 v7
    dcl_position o0
    dcl_texcoord o1
    dcl_texcoord1 o2.xyz
    dcl_texcoord2 o3.xyz
    dcl_texcoord3 o4.xyz
    dcl_texcoord4 o5.xyz
    dcl_texcoord5 o6.xyz
    dcl_texcoord6 o7
    dcl_texcoord8 o8
    dcl_texcoord9 o9.xy

    // r0.xyz = v6.zyx * 2 - 1 (same expansion as the static shader applies to dcl_color)
    mad r0.xyz, v6.zyxw, c242.x, c242.y
    // a0 = v3.zyxw * 765.005859, so a0.x comes from v3.z, a0.y from v3.y, a0.z from v3.x
    mul r1, c242.z, v3.zyxw
    mova a0, r1
    // r1 = blended first row: each weight v1.N is paired with the index from v3.N
    mul r1, v1.y, c11[a0.y]
    mad r1, v1.z, c11[a0.x], r1
    mad r1, v1.x, c11[a0.z], r1
    mad r1, v1.w, c11[a0.w], r1
    dp3 r2.x, r0, r1
    // r3 = blended second row
    mul r3, v1.y, c12[a0.y]
    mad r3, v1.z, c12[a0.x], r3
    mad r3, v1.x, c12[a0.z], r3
    mad r3, v1.w, c12[a0.w], r3
    dp3 r2.y, r0, r3
    // r4 = blended third row
    mul r4, v1.y, c13[a0.y]
    mad r4, v1.z, c13[a0.x], r4
    mad r4, v1.x, c13[a0.z], r4
    mad r4, v1.w, c13[a0.w], r4
    dp3 r2.z, r0, r4
    // the skinned v6 vector is renormalized, then transformed by the 3x3 part of c0-c2
    nrm r0.xyz, r2
    dp3 o6.x, r0, c0
    dp3 o6.y, r0, c1
    dp3 o6.z, r0, c2

    // r0 = (position.xyz, 1): multipliers (1,1,1,0), addends (0,0,0,1)
    mad r0, v0.xyzx, -c242.yyyw, -c242.wwwy
    // skin the position with the blended rows (dp4, so their .w column is applied)
    dp4 r2.x, r0, r1
    dp4 r2.y, r0, r3
    dp4 r2.z, r0, r4
    mov r2.w, -c242.y               // r2.w = 1
    // transform by c0-c2, then by c3-c6 for the output position
    dp4 r0.x, r2, c0
    dp4 r0.z, r2, c2
    dp4 r0.y, r2, c1
    mov r0.w, -c242.y               // r0.w = 1
    dp4 o0.x, r0, c3
    dp4 o0.y, r0, c4
    dp4 o0.z, r0, c5
    dp4 o0.w, r0, c6
    // r0.w = saturate(dot(r0, c10) * c8.x + c8.y), as in the static shader
    dp4 r0.w, r0, c10
    mad_sat r0.w, r0.w, c8.x, c8.y

    // v7 (dcl_color1): expand, skin with the 3x3 part of the blended rows, transform by c0-c2
    mad r2.xyz, v7.zyxw, c242.x, c242.y
    dp3 r5.x, r2, r1
    dp3 r5.y, r2, r3
    dp3 r5.z, r2, r4
    dp3 o4.x, r5, c0
    dp3 o4.y, r5, c1
    dp3 o4.z, r5, c2

    // v2 (dcl_blendweight1): same treatment; r1 (first blended row) is overwritten here
    mad r2.xyz, v2.zyxw, c242.x, c242.y
    dp3 r1.x, r2, r1
    dp3 r1.y, r2, r3
    dp3 r1.z, r2, r4
    dp3 o5.x, r1, c0
    dp3 o5.y, r1, c1
    dp3 o5.z, r1, c2

    // --- region test against c236/c237/c238 (not present in the static shader) ---
    // NOTE(review): this looks like a signed distance to a box that is rotated about
    // the y axis by the angle c237.z, used to modify o8 - confirm against the game.
    add r1.xyz, r0.xzzw, -c236.xyyw     // r1.x = r0.x - c236.x; r1.y = r1.z = r0.z - c236.y
    mov r2.z, c237.z
    mad r1.w, r2.z, c243.x, c243.y      // r1.w = c237.z * -1/(2*pi) + 0.5
    frc r1.w, r1.w                      // keep the fractional part
    mad r1.w, r1.w, c243.z, c243.w      // r1.w = r1.w * 2*pi - pi, i.e. within [-pi, pi)
    sincos r2.xy, r1.w                  // r2.x = cos(r1.w), r2.y = sin(r1.w)
    mul r1.yzw, r1.xxyz, r2.xxyx        // r1.y = r1.x*cos; r1.z = r1.y*sin; r1.w = r1.z*cos
    mad r2.z, r1.x, -r2.y, r1.w         // r2.z = -r1.x*sin + r1.w
    add r2.x, r1.z, r1.y                // r2.x = r1.z + r1.y
    add r1.xz, -r2, c236.zyww           // r1.x = c236.z - r2.x; r1.z = c236.w - r2.z
    add r2.y, r0.y, -c237.x             // r2.y = r0.y - c237.x
    add r1.y, -r2.y, c237.y             // r1.y = c237.y - r2.y
    // r1.x = smallest of the six values in r1.xyz and r2.xyz
    min r1.xyz, r2, r1
    min r1.y, r1.z, r1.y
    min r1.x, r1.x, r1.y
    add r1.x, r1.x, -c237.w             // r1.x = (r1.x - c237.w) * c238.w
    mul r1.x, r1.x, c238.w

    // r0.w = saturate(f - saturate((r0.y * c8.z + c8.w) * (1 - f))), f = r0.w from above;
    // the static shader writes this value straight to o8.w
    mad r1.y, r0.y, c8.z, c8.w
    add r1.z, -r0.w, -c242.y            // r1.z = 1 - r0.w
    mul_sat r1.y, r1.y, r1.z
    add_sat r0.w, r0.w, -r1.y
    // here the region value additionally limits o8.w and shifts o8.xyz from c9 towards c238.xyz
    add r1.y, -r0.w, -c242.y            // r1.y = 1 - r0.w
    mul_sat r1.y, -r1.x, r1.y           // r1.y = saturate(-r1.x * (1 - r0.w))
    max r1.x, -r1.x, -c242.w            // r1.x = max(-r1.x, 0)
    min o8.w, r0.w, r1.x                // o8.w = min(r0.w, r1.x)
    add r1.x, -r1.y, -c242.y            // r1.x = 1 - r1.y
    mov r2.xyz, c9
    add r1.yzw, -r2.xxyz, c238.xxyz     // r1.yzw = c238.xyz - c9.xyz
    mul r1.xyz, r1.x, r1.yzww           // r1.xyz = r1.x * (c238.xyz - c9.xyz)
    mad o8.xyz, r0.w, r1, c9            // o8.xyz = c9 + r0.w * r1.xyz

    // UVs: same pattern as the static shader, using c240 / c239 / c241 instead of c12 / c11 / c13
    mad o1.zw, v5.xyxy, c240.xyxy, c240 // o1.zw = v5.xy * c240.xy + c240.zw
    mad r1.xy, v4, c239, c239.zwzw      // r1.xy = v4.xy * c239.xy + c239.zw
    mul o9.xy, r1, c241                 // o9.xy = r1.xy * c241.xy
    mov o1.xy, r1

    // o3 = transformed position, o2 = normalize(transformed position - c7)
    add r1.xyz, r0, -c7
    mov o3.xyz, r0
    dp3 r0.x, r1, r1
    rsq r0.x, r0.x
    mul o2.xyz, r1, r0.x
    mov o7, -c242.yyww                  // o7 = (1, 1, 0, 0)

Pixel shader

ps_3_0
    // Pixel shader for skinned meshes.
    // It follows the same three-source pattern as the static-mesh pixel shader
    // (c2/c3, c0/c1, c4/c5). The differences visible here: the power-32 terms are
    // multiplied by a third texture (s2, sampled at v0) instead of a constant, and the
    // c8 term near the end is scaled by the s2 alpha.
    // NOTE(review): s2 is presumably a specular map - confirm.
    def c9, 2, -1, 1, 0.5               // x/y = 2/-1 expand [0,1] to [-1,1]; z = 1; w = 0.5
    def c10, 0, 32, 0, 0                // x = 0 (lower clamp), y = 32 (exponent)
    dcl_texcoord v0.xy
    dcl_texcoord1 v1.xyz
    dcl_texcoord2 v2.xyz
    dcl_texcoord3 v3.xyz
    dcl_texcoord4 v4.xyz
    dcl_texcoord5 v5.xyz
    dcl_texcoord8 v6
    dcl_texcoord9 v7.xy
    dcl_2d s0
    dcl_2d s1
    dcl_2d s2

    // --- per-pixel normal from s1 (w and y channels), kept in r0.xyz ---
    texld r0, v7, s1
    mad r0.xy, r0.wyzw, c9.x, c9.y      // r0.x = r0.w * 2 - 1; r0.y = r0.y * 2 - 1
    mul r1.xyz, r0.y, v4
    mad r1.xyz, r0.x, v3, r1            // r1 = r0.x * v3 + r0.y * v4
    dp2add r0.x, r0, -r0, c9.z          // r0.x = 1 - r0.x^2 - r0.y^2
    rsq r0.x, r0.x
    rcp r0.x, r0.x                      // r0.x = sqrt(1 - x^2 - y^2)
    mad r0.xyz, r0.x, v5, r1            // r0.xyz = r0.x * v5 + r1 (N, not renormalized)

    // --- source c2/c3 ---
    add r1.xyz, c2, -v2                 // r1 = c2 - position
    dp3 r0.w, r1, r1
    rsq r1.w, r0.w
    mov r2.z, c9.z                      // r2.z = 1
    mad r0.w, r0.w, -c2.w, r2.z         // r0.w = 1 - |r1|^2 * c2.w
    max r2.x, r0.w, c10.x
    add_sat r0.w, r2.x, r2.x            // r0.w = saturate(2 * max(r0.w, 0))
    mul r2.xyw, r0.w, c3.xyzz           // r2.xyw = r0.w * c3.xyz
    nrm r3.xyz, v1                      // r3 = normalize(v1)
    mad r4.xyz, r1, r1.w, -r3           // r4 = normalize(r1) - r3 (unnormalized half vector)
    mul r1.xyz, r1, r1.w                // r1 = normalize(r1)
    dp3_sat r0.w, r0, r1
    mul r1.xyz, r2.xyww, r0.w           // r1 = direct term for c2/c3
    nrm r5.xyz, r4
    dp3_sat r0.w, r0, r5
    pow_sat r1.w, r0.w, c10.y           // r1.w = saturate(dot(N, H))^32
    texld r4, v0, s2                    // r4 = s2 texel; r4.w is kept until the c8 term below
    mul r2.xyw, r2, r4.xyzz             // scale by the s2 color (the static shader uses c8 here)
    mul r2.xyw, r1.w, r2                // r2.xyw = power-32 term for c2/c3
    mul r5.xyz, r2.xyww, c3.w           // r5 = accumulator of power-32 terms weighted by cN.w

    // --- source c0/c1 ---
    add r6.xyz, c0, -v2
    dp3 r0.w, r6, r6
    rsq r1.w, r0.w
    mad r0.w, r0.w, -c0.w, r2.z
    max r3.w, r0.w, c10.x
    add_sat r0.w, r3.w, r3.w
    mul r7.xyz, r0.w, c1
    mad r8.xyz, r6, r1.w, -r3
    mul r6.xyz, r6, r1.w
    dp3_sat r0.w, r0, r6
    mul r6.xyz, r7, r0.w                // r6 = direct term for c0/c1
    mul r7.xyz, r4, r7
    nrm r9.xyz, r8
    dp3_sat r0.w, r0, r9
    pow_sat r1.w, r0.w, c10.y
    mul r7.xyz, r7, r1.w                // r7 = power-32 term for c0/c1
    mad r5.xyz, r7, c1.w, r5

    // --- source c4/c5 ---
    add r8.xyz, c4, -v2
    dp3 r0.w, r8, r8
    rsq r1.w, r0.w
    mad r0.w, r0.w, -c4.w, r2.z
    max r3.w, r0.w, c10.x
    add_sat r0.w, r3.w, r3.w
    mul r9.xyz, r0.w, c5
    mad r3.xyz, r8, r1.w, -r3
    mul r8.xyz, r8, r1.w
    dp3_sat r0.w, r0, r8
    mul r8.xyz, r9, r0.w                // r8 = direct term for c4/c5
    mul r4.xyz, r4, r9
    nrm r9.xyz, r3
    dp3_sat r0.x, r0, r9                // overwrites N.x; only N.y (r0.y) is still needed
    mad r0.y, r0.y, c9.w, c9.w          // r0.y = N.y * 0.5 + 0.5
    pow_sat r1.w, r0.x, c10.y
    mul r0.xzw, r4.xyyz, r1.w           // r0.xzw = power-32 term for c4/c5
    mad r3.xyz, r0.xzww, c5.w, r5       // r3 = sum of power-32 terms weighted by cN.w

    // --- combine: the (1 - cN.w)-weighted sums are doubled, then 2 * the cN.w-weighted sums are added ---
    add r1.w, r2.z, -c3.w               // r1.w = 1 - c3.w
    mul r2.xyw, r2, r1.w
    mul r4.xyz, r1, r1.w
    mul r1.xyz, r1, c3.w
    mad r1.xyz, r6, c1.w, r1
    mad r1.xyz, r8, c5.w, r1
    add r1.w, r2.z, -c1.w               // r1.w = 1 - c1.w
    mad r2.xyw, r7.xyzz, r1.w, r2
    mad r4.xyz, r6, r1.w, r4
    add r1.w, r2.z, -c5.w               // r1.w = 1 - c5.w
    mad r0.xzw, r0, r1.w, r2.xyyw
    mad r2.xyz, r8, r1.w, r4
    add r2.xyz, r2, r2
    mad r1.xyz, r1, c9.x, r2            // r1 = summed direct terms
    add r0.xzw, r0, r0
    mad r0.xzw, r3.xyyz, c9.x, r0       // r0.xzw = summed power-32 terms

    // --- add lerp(c7, c6, N.y * 0.5 + 0.5) and c8 * s2 alpha ---
    mov r2.xyz, c7
    add r2.xyz, -r2, c6
    mad r2.xyz, r0.y, r2, c7
    add r1.xyz, r1, r2
    mad r1.xyz, c8, r4.w, r1

    // --- s0 texel * direct terms + power-32 terms, then blend with v6 as in the static shader ---
    texld r2, v0, s0
    mad r0.xyz, r2, r1, r0.xzww
    mov oC0.w, r2.w                     // output alpha = s0 alpha
    add r0.xyz, r0, -v6
    mad oC0.xyz, v6.w, r0, v6           // oC0.xyz = v6.xyz + v6.w * (r0 - v6.xyz)

// approximately 104 instruction slots used (3 texture, 101 arithmetic)