{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":744291556,"defaultBranch":"main","name":"channels-last-groupnorm","ownerLogin":"BearNinja123","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2024-01-17T01:54:38.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/75278258?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1707378020.0","currentOid":""},"activityList":{"items":[{"before":"d6b8915eb324a7a955dc6d75753777753e45eccf","after":"dbc01a9585b1106c7596e7f8fd909003c0d406cc","ref":"refs/heads/main","pushedAt":"2024-05-08T21:19:30.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"modified code to be more consistent with GN writeup","shortMessageHtmlLink":"modified code to be more consistent with GN writeup"}},{"before":"3ab5950bdf18ef51056a4162d61add7b4bca3006","after":"d6b8915eb324a7a955dc6d75753777753e45eccf","ref":"refs/heads/main","pushedAt":"2024-02-25T20:25:56.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"nvm gnNHWC is 50% faster than triton GN, it really is that only 2% of SD is group_norm (based on nsight-sys profiling)","shortMessageHtmlLink":"nvm gnNHWC is 50% faster than triton GN, it really is that only 2% of…"}},{"before":"3118218e779ff7972668d10346b513a501f38333","after":"3ab5950bdf18ef51056a4162d61add7b4bca3006","ref":"refs/heads/main","pushedAt":"2024-02-21T16:39:32.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"update README","shortMessageHtmlLink":"update README"}},{"before":"eeffa8a9fb15c5851fe5dd42697c7a6773dead29","after":"3118218e779ff7972668d10346b513a501f38333","ref":"refs/heads/main","pushedAt":"2024-02-20T23:47:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"fix GN error checking yet again, redo tests (cuda GN only 5-10% faster than triton)","shortMessageHtmlLink":"fix GN error checking yet again, redo tests (cuda GN only 5-10% faste…"}},{"before":"4a3cbb03f49120c94de9e4f5957080f278d060cd","after":"eeffa8a9fb15c5851fe5dd42697c7a6773dead29","ref":"refs/heads/main","pushedAt":"2024-02-20T22:53:24.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"added gelu/gelu tanh activations","shortMessageHtmlLink":"added gelu/gelu tanh activations"}},{"before":"3e7d261fbe0ef55b22d4aa6079eaaf5517c87302","after":"4a3cbb03f49120c94de9e4f5957080f278d060cd","ref":"refs/heads/main","pushedAt":"2024-02-20T22:14:05.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"template activation function speeds up elementwise kernels more than i expected","shortMessageHtmlLink":"template activation function speeds up elementwise kernels more than …"}},{"before":"cd99154dda270426307221eda8c422deea65c06e","after":"3e7d261fbe0ef55b22d4aa6079eaaf5517c87302","ref":"refs/heads/main","pushedAt":"2024-02-20T16:55:07.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"minor tweaks to test compilation speed","shortMessageHtmlLink":"minor tweaks to test compilation speed"}},{"before":"9f416706a133a599672267d077f77d7143a0d498","after":"cd99154dda270426307221eda8c422deea65c06e","ref":"refs/heads/main","pushedAt":"2024-02-20T04:55:42.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"change README to include A100 test","shortMessageHtmlLink":"change README to include A100 test"}},{"before":"10675bfb73bf9c2feab7ffc1c33f38c163019136","after":"9f416706a133a599672267d077f77d7143a0d498","ref":"refs/heads/main","pushedAt":"2024-02-19T17:29:03.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"fix readme","shortMessageHtmlLink":"fix readme"}},{"before":"506d08af1ac10e3ad237e1491092e06ac531f7c2","after":"10675bfb73bf9c2feab7ffc1c33f38c163019136","ref":"refs/heads/main","pushedAt":"2024-02-19T17:27:22.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"Python wrapper + more documentation","shortMessageHtmlLink":"Python wrapper + more documentation"}},{"before":"3f8c20bb2bff4e4703741497171644eb152d39ce","after":"506d08af1ac10e3ad237e1491092e06ac531f7c2","ref":"refs/heads/main","pushedAt":"2024-02-12T18:56:54.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"update README","shortMessageHtmlLink":"update README"}},{"before":"dc850e9aaf86a48510d2586b1509c06b17289ea0","after":"3f8c20bb2bff4e4703741497171644eb152d39ce","ref":"refs/heads/main","pushedAt":"2024-02-11T06:28:31.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"fixing some more issues","shortMessageHtmlLink":"fixing some more issues"}},{"before":"3128cc6c191f8f064ee49a06e549f686095347b5","after":"dc850e9aaf86a48510d2586b1509c06b17289ea0","ref":"refs/heads/main","pushedAt":"2024-02-11T01:49:25.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"fused activation bwd works","shortMessageHtmlLink":"fused activation bwd works"}},{"before":"f34c886753e5556307d07468e930f4767f1941cb","after":"3128cc6c191f8f064ee49a06e549f686095347b5","ref":"refs/heads/main","pushedAt":"2024-02-09T19:52:35.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"cuda graphs works, custom NHWC GN kernel should be a drop-in replacement for stable-fast Triton GN and runs SD 1% faster for batch size = 1","shortMessageHtmlLink":"cuda graphs works, custom NHWC GN kernel should be a drop-in replacem…"}},{"before":"921d5794394efdbd40826d664a1e403795619590","after":"f34c886753e5556307d07468e930f4767f1941cb","ref":"refs/heads/main","pushedAt":"2024-02-09T09:55:52.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"remove at::Tensor from custom cuda kernels -> 2x faster compilation speed (10-15x faster compilation than nchw_kernel.cu) + fixing more bugs","shortMessageHtmlLink":"remove at::Tensor from custom cuda kernels -> 2x faster compilation s…"}},{"before":"2b1b490ac06785577316ad032e506a09a830341c","after":"921d5794394efdbd40826d664a1e403795619590","ref":"refs/heads/main","pushedAt":"2024-02-08T09:15:27.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"minor tweaks","shortMessageHtmlLink":"minor tweaks"}},{"before":"7bca37aa5f525990cbaba3de7c28b61850729f83","after":"2b1b490ac06785577316ad032e506a09a830341c","ref":"refs/heads/main","pushedAt":"2024-02-08T07:47:48.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"deleted kernels that weren't that helpful for performance","shortMessageHtmlLink":"deleted kernels that weren't that helpful for performance"}},{"before":null,"after":"7bca37aa5f525990cbaba3de7c28b61850729f83","ref":"refs/heads/extra-kernels","pushedAt":"2024-02-08T07:40:20.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"bwd works with all input sizes","shortMessageHtmlLink":"bwd works with all input sizes"}},{"before":"68fcac97d606e1a2aab19f5b3880e0172d3925cf","after":"7bca37aa5f525990cbaba3de7c28b61850729f83","ref":"refs/heads/main","pushedAt":"2024-02-08T07:35:59.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"bwd works with all input sizes","shortMessageHtmlLink":"bwd works with all input sizes"}},{"before":"d5162cc2423329582cdbca14716180f7d96b88c9","after":"68fcac97d606e1a2aab19f5b3880e0172d3925cf","ref":"refs/heads/main","pushedAt":"2024-02-07T01:14:46.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"making fused impl faster (faster, but doesn't help in real world cases) + micro optimizing NH","shortMessageHtmlLink":"making fused impl faster (faster, but doesn't help in real world case…"}},{"before":"294f01e0e6c3e6931ceb7a4b1739d0981d4e39c9","after":"d5162cc2423329582cdbca14716180f7d96b88c9","ref":"refs/heads/main","pushedAt":"2024-02-06T01:41:08.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"made fwd work for all(?) shape inputs","shortMessageHtmlLink":"made fwd work for all(?) shape inputs"}},{"before":"73f7d3b0f15b530cdb4ea42e42429dbe9e36fbae","after":"294f01e0e6c3e6931ceb7a4b1739d0981d4e39c9","ref":"refs/heads/main","pushedAt":"2024-02-05T06:55:29.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"GN fwd works with more input shapes (non-powers of 2), works in SD","shortMessageHtmlLink":"GN fwd works with more input shapes (non-powers of 2), works in SD"}},{"before":"81f7e02f72bf6f40f55cee78a9d088b9834db963","after":"73f7d3b0f15b530cdb4ea42e42429dbe9e36fbae","ref":"refs/heads/main","pushedAt":"2024-02-05T00:26:20.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"minor changes, removed some includes from nchw_kernel (still compiles 5x slower than nhwc kernel)","shortMessageHtmlLink":"minor changes, removed some includes from nchw_kernel (still compiles…"}},{"before":"0cdc013799f267588513ac2c9acf940827a1444e","after":"81f7e02f72bf6f40f55cee78a9d088b9834db963","ref":"refs/heads/main","pushedAt":"2024-02-05T00:06:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"tried vectorizing spatial loop, didn't work","shortMessageHtmlLink":"tried vectorizing spatial loop, didn't work"}},{"before":"40ee15527db24e059e6a9b125273b9f94a8f8166","after":"0cdc013799f267588513ac2c9acf940827a1444e","ref":"refs/heads/main","pushedAt":"2024-02-04T06:36:44.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"rewrote generic elementwise kernels into faster custom kernels for fwd + bwd, fixed a bug in sum reducing regarding syncthreads","shortMessageHtmlLink":"rewrote generic elementwise kernels into faster custom kernels for fw…"}},{"before":"e041bf5abfef16dea1f4866f35bf3fba0725920c","after":"40ee15527db24e059e6a9b125273b9f94a8f8166","ref":"refs/heads/main","pushedAt":"2024-02-03T22:26:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"converted elementwise kernel into custom kernel (fwd) -> profit (~7% speedup on RF++ fwd at small batch size)","shortMessageHtmlLink":"converted elementwise kernel into custom kernel (fwd) -> profit (~7% …"}},{"before":"2a774d70b4894e460362a0a083d5a579b342e287","after":"e041bf5abfef16dea1f4866f35bf3fba0725920c","ref":"refs/heads/main","pushedAt":"2024-02-03T07:50:39.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"debloating (also fused two kernels calculating coefs in bwd)","shortMessageHtmlLink":"debloating (also fused two kernels calculating coefs in bwd)"}},{"before":"9acf38468cf86a7310e60763b91aa353cedff38c","after":"2a774d70b4894e460362a0a083d5a579b342e287","ref":"refs/heads/main","pushedAt":"2024-02-02T22:06:06.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"using NCH output buffer for [x]dy_sum_data (bwd) and NGH output buffer for welford data buffer (fwd) (instead of NHC/NHG), NHWC restoreformer ~20% faster (fwd+bwd) than NCHW restoreformer with nn groupnorm","shortMessageHtmlLink":"using NCH output buffer for [x]dy_sum_data (bwd) and NGH output buffe…"}},{"before":"bc22988b43a00bddc72f1d190dc3ad6ea5e81643","after":"9acf38468cf86a7310e60763b91aa353cedff38c","ref":"refs/heads/main","pushedAt":"2024-02-02T16:03:15.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"bwd NH kernel done","shortMessageHtmlLink":"bwd NH kernel done"}},{"before":"8ae8bdd23d7c5f8998d057bb4db20424a49f9d5c","after":"bc22988b43a00bddc72f1d190dc3ad6ea5e81643","ref":"refs/heads/main","pushedAt":"2024-02-01T23:52:51.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"BearNinja123","name":null,"path":"/BearNinja123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/75278258?s=80&v=4"},"commit":{"message":"working bwd kernel (unoptimized, poorly tested)","shortMessageHtmlLink":"working bwd kernel (unoptimized, poorly tested)"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAERQ0ClwA","startCursor":null,"endCursor":null}},"title":"Activity · BearNinja123/channels-last-groupnorm"}